From 74be4427a9fd532c02f0630da6eff8125652e626 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Wed, 3 Jun 2026 20:55:53 -0400
Subject: [PATCH 1/7] cudax/stf: migrate internal/ context + resources from
 cuda_safe_call to cuda_try

Second slice of the internal/ migration, covering the stream-lifetime /
resource-handle files. Two sites get real transactional handling:

- ctx_resource.cuh: hold the batched callback resource list in a
  unique_ptr and only release() it after cudaStreamAddCallback succeeds,
  so a throw from the enqueue no longer leaks the list. The enqueue uses
  the templated cuda_try<cudaStreamAddCallback> form.

- slice.cuh: pin() is all-or-nothing, but the 2D/3D paths pin several
  regions in a loop. Record each pinned base pointer and roll them back
  via SCOPE(fail) if a later pin_memory throws, leaving the slice
  unpinned (matching the address_is_pinned early-return that treats the
  base address as a proxy for the whole slice). pin_memory stays in the
  runtime-status form (it is an overload-set template); unpin_memory only
  aborts, never throws, so it is safe inside the noexcept SCOPE(fail).
  Adds <vector> and scope_guard.cuh includes.

The remaining conversions are mechanical:

- async_resources_handle.cuh: cudaGraphGetNodes / cudaGraphGetEdges keep
  the runtime-status form (multiple output pointers, one passed as
  nullptr).
- context.cuh: UNITTEST bodies switch to cuda_try, using the templated
  cuda_try<F> form where applicable (cudaStreamCreate, cudaSetDevice,
  cudaStreamSynchronize/Destroy) and ::std::ignore for the discarded
  cudaGraphAddEmptyNode handle.
---
 .../__stf/internal/async_resources_handle.cuh |  6 ++--
 .../experimental/__stf/internal/context.cuh   | 30 +++++++---------
 .../__stf/internal/ctx_resource.cuh           |  9 +++--
 .../experimental/__stf/internal/slice.cuh     | 34 +++++++++++++++----
 4 files changed, 48 insertions(+), 31 deletions(-)
diff --git a/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh
index 4cf56af9dd1..767a7d3ce95 100644
--- a/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh
@@ -235,11 +235,11 @@ public:
     size_t nedges;
     size_t nnodes;
 
-    cuda_safe_call(cudaGraphGetNodes(g, nullptr, &nnodes));
+    cuda_try(cudaGraphGetNodes(g, nullptr, &nnodes));
 #if _CCCL_CTK_AT_LEAST(13, 0)
-    cuda_safe_call(cudaGraphGetEdges(g, nullptr, nullptr, nullptr, &nedges));
+    cuda_try(cudaGraphGetEdges(g, nullptr, nullptr, nullptr, &nedges));
 #else // _CCCL_CTK_AT_LEAST(13, 0)
-    cuda_safe_call(cudaGraphGetEdges(g, nullptr, nullptr, &nedges));
+    cuda_try(cudaGraphGetEdges(g, nullptr, nullptr, &nedges));
 #endif // _CCCL_CTK_AT_LEAST(13, 0)
 
     _CCCL_ASSERT(pimpl, "async_resources_handle is not initialized");
diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh
index f99c0115c33..50d66b3599f 100644
--- a/cudax/include/cuda/experimental/__stf/internal/context.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh
@@ -1121,18 +1121,17 @@ UNITTEST("context resources released on finalize non blocking")
     }
   };
 
-  cudaStream_t stream;
-  cuda_safe_call(cudaStreamCreate(&stream));
+  const cudaStream_t stream = cuda_try<cudaStreamCreate>();
 
   bool released = false;
   context ctx(stream, async_resources_handle());
   ctx.add_resource(::std::make_shared<dummy_released_resource>(&released));
   ctx.finalize(); // non-blocking: context was created with user stream
   EXPECT(!released); // not yet, callback not run
-  cuda_safe_call(cudaStreamSynchronize(stream));
+  cuda_try<cudaStreamSynchronize>(stream);
   EXPECT(released);
 
-  cuda_safe_call(cudaStreamDestroy(stream));
+  cuda_try<cudaStreamDestroy>(stream);
 };
 
 UNITTEST("context import_resources_from")
@@ -1182,8 +1181,7 @@ UNITTEST("context graph and stage")
 
 UNITTEST("context with arguments")
 {
-  cudaStream_t stream;
-  cuda_safe_call(cudaStreamCreate(&stream));
+  const cudaStream_t stream = cuda_try<cudaStreamCreate>();
 
   async_resources_handle h;
 
@@ -1199,7 +1197,7 @@ UNITTEST("context with arguments")
   context ctx4 = graph_ctx(stream, h);
   ctx4.finalize();
 
-  cuda_safe_call(cudaStreamDestroy(stream));
+  cuda_try<cudaStreamDestroy>(stream);
 };
 
 #  if !defined(CUDASTF_DISABLE_CODE_GENERATION) && _CCCL_CUDA_COMPILATION()
@@ -1706,8 +1704,7 @@ UNITTEST("make_tuple_indexwise")
 
 UNITTEST("cuda stream place")
 {
-  cudaStream_t user_stream;
-  cuda_safe_call(cudaStreamCreate(&user_stream));
+  const cudaStream_t user_stream = cuda_try<cudaStreamCreate>();
 
   context ctx;
 
@@ -1726,16 +1723,14 @@ UNITTEST("cuda stream place")
 
 UNITTEST("cuda stream place multi-gpu")
 {
-  cudaStream_t user_stream;
-
   // Create a CUDA stream in a different device (if available)
-  int ndevices = cuda_try<cudaGetDeviceCount>();
+  const int ndevices = cuda_try<cudaGetDeviceCount>();
   // use the last device
-  int target_dev_id = ndevices - 1;
+  const int target_dev_id = ndevices - 1;
 
-  cuda_safe_call(cudaSetDevice(target_dev_id));
-  cuda_safe_call(cudaStreamCreate(&user_stream));
-  cuda_safe_call(cudaSetDevice(0));
+  cuda_try<cudaSetDevice>(target_dev_id);
+  const cudaStream_t user_stream = cuda_try<cudaStreamCreate>();
+  cuda_try<cudaSetDevice>(0);
 
   context ctx;
 
@@ -1826,8 +1821,7 @@ UNITTEST("get_stream graph")
   cudaStream_t s = t.get_stream();
   // We are not capturing so there is no stream associated
   EXPECT(s == nullptr);
-  cudaGraphNode_t n;
-  cuda_safe_call(cudaGraphAddEmptyNode(&n, t.get_graph(), nullptr, 0));
+  ::std::ignore = cuda_try<cudaGraphAddEmptyNode>(t.get_graph(), nullptr, 0);
   t.end();
 
   auto t2 = ctx.task(token.rw());
diff --git a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh
index d0cd3831b24..03229face07 100644
--- a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh
@@ -106,8 +106,10 @@ public:
     // Batch all callback resources into a single host callback for efficiency
     if (!callback_resources.empty())
     {
-      // Transfer ownership of callback resources to the callback
-      auto* callback_list = new ::std::vector<::std::shared_ptr<ctx_resource>>(mv(callback_resources));
+      // Transfer ownership of callback resources to the callback. Held in a
+      // unique_ptr until the callback is successfully enqueued so a throw from
+      // cudaStreamAddCallback does not leak the list.
+      auto callback_list = ::std::make_unique<::std::vector<::std::shared_ptr<ctx_resource>>>(mv(callback_resources));
 
       // Add a single host callback using lambda that will release all callback resources
       auto release_lambda = [](cudaStream_t /*stream*/, cudaError_t /*status*/, void* userData) -> void {
@@ -123,7 +125,8 @@ public:
         delete resources;
       };
 
-      cuda_safe_call(cudaStreamAddCallback(stream, release_lambda, callback_list, 0));
+      cuda_try<cudaStreamAddCallback>(stream, release_lambda, callback_list.get(), 0);
+      callback_list.release();
     }
 
     // Mark as released to prevent double release
diff --git a/cudax/include/cuda/experimental/__stf/internal/slice.cuh b/cudax/include/cuda/experimental/__stf/internal/slice.cuh
index ffa39b678f4..bc61edafbfd 100644
--- a/cudax/include/cuda/experimental/__stf/internal/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/slice.cuh
@@ -31,8 +31,10 @@
 #include <cuda/experimental/__stf/utility/dimensions.cuh>
 #include <cuda/experimental/__stf/utility/hash.cuh>
 #include <cuda/experimental/__stf/utility/memory.cuh>
+#include <cuda/experimental/__stf/utility/scope_guard.cuh>
 
 #include <iostream>
+#include <vector>
 
 namespace cuda::experimental::stf
 {
@@ -820,13 +822,31 @@ bool pin(mdspan<T, P...>& s)
     return false;
   }
 
+  // pin() is all-or-nothing: if any region fails to pin, roll back the ones we
+  // already pinned so the slice is left unpinned (matching the early-return
+  // above, which treats the base address as a proxy for the whole slice).
+  // unpin_memory never throws (it aborts at most), so it is safe in SCOPE(fail).
+  ::std::vector<T*> pinned;
+  SCOPE(fail)
+  {
+    for (T* p : pinned)
+    {
+      unpin_memory(p);
+    }
+  };
+
+  const auto pin_one = [&pinned](T* ptr, size_t n) {
+    cuda_try(pin_memory(ptr, n));
+    pinned.push_back(ptr);
+  };
+
   if constexpr (rank == 0)
   {
-    cuda_safe_call(pin_memory(s.data_handle(), 1));
+    pin_one(s.data_handle(), 1);
   }
   else if constexpr (rank == 1)
   {
-    cuda_safe_call(pin_memory(s.data_handle(), s.extent(0)));
+    pin_one(s.data_handle(), s.extent(0));
   }
   else if constexpr (rank == 2)
   {
@@ -835,12 +855,12 @@ bool pin(mdspan<T, P...>& s)
       case 1:
         for (size_t index_1 = 0; index_1 < s.extent(1); index_1++)
         {
-          cuda_safe_call(pin_memory(&s(0, index_1) + index_1 * s.stride(1), s.extent(0)));
+          pin_one(&s(0, index_1) + index_1 * s.stride(1), s.extent(0));
         }
         break;
       case 2:
         // fprintf(stderr, "PIN 2D - contiguous\n");
-        cuda_safe_call(pin_memory(s.data_handle(), s.extent(0) * s.extent(1)));
+        pin_one(s.data_handle(), s.extent(0) * s.extent(1));
         break;
       default:
         assert(false);
@@ -858,19 +878,19 @@ bool pin(mdspan<T, P...>& s)
           for (size_t index_1 = 0; index_1 < s.extent(1); index_1++)
           {
             // fprintf(stderr, "ADDR %d,%d,0 = %p \n", index_2, index_1, &s(index_2, index_1, 0));
-            cuda_safe_call(pin_memory(&s(0, index_1, index_2), s.extent(0)));
+            pin_one(&s(0, index_1, index_2), s.extent(0));
           }
         }
         break;
       case 2:
         for (size_t index_2 = 0; index_2 < s.extent(2); index_2++)
         {
-          cuda_safe_call(pin_memory(&s(0, 0, index_2), s.extent(0) * s.extent(1)));
+          pin_one(&s(0, 0, index_2), s.extent(0) * s.extent(1));
         }
         break;
       case 3:
         // fprintf(stderr, "PIN 3D - contiguous\n");
-        cuda_safe_call(pin_memory(s.data_handle(), s.extent(0) * s.extent(1) * s.extent(2)));
+        pin_one(s.data_handle(), s.extent(0) * s.extent(1) * s.extent(2));
         break;
       default:
         assert(false);

From 50b86a1d3ed824a03a7d336b25501be954a564b2 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Wed, 3 Jun 2026 21:04:59 -0400
Subject: [PATCH 2/7] cudax/stf: use templated cuda_try<> last-output form for
 graph node/edge counts

cudaGraphGetNodes / cudaGraphGetEdges have multiple output pointers, but
passing nullptr for the ones we don't need leaves a single synthesized
output (numNodes / numEdges) as the last parameter, so cuda_try's
last-output form applies:

  const size_t nnodes = cuda_try<cudaGraphGetNodes>(g, nullptr);
  const size_t nedges = cuda_try<cudaGraphGetEdges>(g, nullptr, nullptr[, nullptr]);

This also lets the counts be const and drops the separate declarations.
---
 .../__stf/internal/async_resources_handle.cuh            | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh
index 767a7d3ce95..54ac94bf40e 100644
--- a/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh
@@ -232,14 +232,11 @@ public:
 
   ::cuda::std::pair<::std::shared_ptr<cudaGraphExec_t>, bool> cached_graphs_query(cudaGraph_t g)
   {
-    size_t nedges;
-    size_t nnodes;
-
-    cuda_try(cudaGraphGetNodes(g, nullptr, &nnodes));
+    const size_t nnodes = cuda_try<cudaGraphGetNodes>(g, nullptr);
 #if _CCCL_CTK_AT_LEAST(13, 0)
-    cuda_try(cudaGraphGetEdges(g, nullptr, nullptr, nullptr, &nedges));
+    const size_t nedges = cuda_try<cudaGraphGetEdges>(g, nullptr, nullptr, nullptr);
 #else // _CCCL_CTK_AT_LEAST(13, 0)
-    cuda_try(cudaGraphGetEdges(g, nullptr, nullptr, &nedges));
+    const size_t nedges = cuda_try<cudaGraphGetEdges>(g, nullptr, nullptr);
 #endif // _CCCL_CTK_AT_LEAST(13, 0)
 
     _CCCL_ASSERT(pimpl, "async_resources_handle is not initialized");

From cfbd028c2466ef99c4d2501ee30264d01a5d4d36 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Wed, 3 Jun 2026 21:57:48 -0400
Subject: [PATCH 3/7] cudax/stf: make ctx_resource_set::release stream-resource
 removal retryable

Replace the null-then-erase pass with an in-place swap-from-back
compaction: each stream-dependent resource is released and then removed
by moving the last element into its slot and popping. A resource leaves
`resources` only after release(stream) succeeds, so if it throws, the
vector still holds the failing resource plus everything not yet
processed -- release() can be retried with nothing lost and no
double-release. No null slots linger, so no separate erase pass (and no
<algorithm>) is needed.
---
 .../__stf/internal/ctx_resource.cuh           | 37 ++++++++++---------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh
index 03229face07..7f8ad21c4c7 100644
--- a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh
@@ -87,42 +87,43 @@ public:
   {
     _CCCL_ASSERT(!resources_released, "Resources have already been released on this context");
 
-    // Separate resources into stream-dependent and callback-batched
-    decltype(resources) callback_resources;
-
-    for (auto& r : resources)
+    // Release stream-dependent resources and compact them out of `resources` by
+    // pulling the last element into each vacated slot. A resource leaves
+    // `resources` only after it has been released, so if release(stream) throws,
+    // `resources` still holds the failing resource plus everything not yet
+    // processed -- release() can be retried with nothing lost or double-released.
+    for (size_t i = 0; i < resources.size();)
     {
-      if (r->can_release_in_callback())
-      {
-        callback_resources.push_back(mv(r));
-      }
-      else
+      if (resources[i]->can_release_in_callback())
       {
-        r->release(stream);
+        ++i;
+        continue;
       }
+      resources[i]->release(stream); // may throw -> resources[i] stays in place
+      resources[i] = mv(resources.back());
+      resources.pop_back();
     }
-    resources.clear();
 
-    // Batch all callback resources into a single host callback for efficiency
-    if (!callback_resources.empty())
+    if (!resources.empty())
     {
       // Transfer ownership of callback resources to the callback. Held in a
       // unique_ptr until the callback is successfully enqueued so a throw from
       // cudaStreamAddCallback does not leak the list.
-      auto callback_list = ::std::make_unique<::std::vector<::std::shared_ptr<ctx_resource>>>(mv(callback_resources));
+      auto callback_list = ::std::make_unique<::std::vector<::std::shared_ptr<ctx_resource>>>(mv(resources));
 
       // Add a single host callback using lambda that will release all callback resources
       auto release_lambda = [](cudaStream_t /*stream*/, cudaError_t /*status*/, void* userData) -> void {
-        auto* resources = static_cast<::std::vector<::std::shared_ptr<ctx_resource>>*>(userData);
+        auto* resources = static_cast<decltype(callback_list.get())>(userData);
+        SCOPE(exit)
+        {
+          delete resources;
+        };
 
         // Release all callback resources
         for (auto& resource : *resources)
         {
           resource->release_in_callback();
         }
-
-        // Clean up the callback list itself
-        delete resources;
       };
 
       cuda_try<cudaStreamAddCallback>(stream, release_lambda, callback_list.get(), 0);

From 1c6cd4c4b298cd62ccf978e57a6ccca32293161c Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Wed, 3 Jun 2026 23:27:55 -0400
Subject: [PATCH 4/7] cudax/stf: simplify slice pin/unpin via a single
 contiguous-hunk walk

Replace the rank-0/1/2/3 x contiguous_dims switch matrix in both pin()
and unpin() with one recursive helper, reserved::for_each_contiguous_hunk,
that invokes f(base, n) for each maximal contiguous hunk: the leading
contiguous_dims() dims form one run, the trailing dims are enumerated
recursively. This covers every element exactly once, handles any rank
(drops the old rank <= 3 limit), and normalizes the odd address math in
the previous 2D paths (which double-counted the stride/extent).

  - unpin() is now just a walk calling unpin_memory on each hunk base.
    unpin_memory ignores not-registered regions, so this is safe on
    fully- or partially-pinned slices.
  - pin()'s rollback is simply SCOPE(fail) { unpin(s); } -- no separate
    `pinned` vector, and no <vector> include. On a mid-walk failure
    unpin() releases the hunks we pinned and no-ops the rest, leaving the
    slice fully unpinned, consistent with the address_is_pinned() proxy.

pin_memory stays in the runtime-status cuda_try(pin_memory(...)) form
(it is an overload-set template, so cuda_try<pin_memory> cannot name it).
The helper is C++17 (recursive generic lambda) and lives in `reserved`.
---
 .../experimental/__stf/internal/slice.cuh     | 200 ++++++------------
 1 file changed, 61 insertions(+), 139 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/slice.cuh b/cudax/include/cuda/experimental/__stf/internal/slice.cuh
index bc61edafbfd..af97157da32 100644
--- a/cudax/include/cuda/experimental/__stf/internal/slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/slice.cuh
@@ -34,7 +34,6 @@
 #include <cuda/experimental/__stf/utility/scope_guard.cuh>
 
 #include <iostream>
-#include <vector>
 
 namespace cuda::experimental::stf
 {
@@ -805,101 +804,46 @@ UNITTEST("3D slice should be similar to 3D mdspan", (slice<double, 3>()))
 #endif // UNITTESTED_FILE
 
 /**
- * @brief Pins a slice in host memory for efficient use with CUDA primitives
+ * @brief Invokes `f(base, n)` once for each maximal contiguous hunk of `s`.
  *
- * @tparam T memory type
- * @tparam dimensions slice dimension
- * @param s slice to pin
+ * `base` points to the first element of the hunk and `n` is the number of
+ * contiguous elements in it. The leading `contiguous_dims(s)` dimensions form a
+ * single contiguous run; the remaining dimensions are enumerated recursively,
+ * yielding one hunk per index tuple. Together the hunks cover every element of
+ * `s` exactly once, assuming the STF slice convention that dimension 0 is
+ * unit-stride.
  */
-template <typename T, typename... P>
-bool pin(mdspan<T, P...>& s)
+namespace reserved
+{
+template <typename T, typename... P, typename F>
+void for_each_contiguous_hunk(const mdspan<T, P...>& s, F&& f)
 {
-  // We need the rank as a constexpr value
   constexpr auto rank = mdspan<T, P...>::extents_type::rank();
+  const size_t c      = contiguous_dims(s);
 
-  if (address_is_pinned(s.data_handle()))
+  // The contiguous prefix [0, c) is a single run of this many elements.
+  size_t hunk = 1;
+  for (size_t d = 0; d < c; ++d)
   {
-    return false;
+    hunk *= s.extent(d);
   }
 
-  // pin() is all-or-nothing: if any region fails to pin, roll back the ones we
-  // already pinned so the slice is left unpinned (matching the early-return
-  // above, which treats the base address as a proxy for the whole slice).
-  // unpin_memory never throws (it aborts at most), so it is safe in SCOPE(fail).
-  ::std::vector<T*> pinned;
-  SCOPE(fail)
-  {
-    for (T* p : pinned)
-    {
-      unpin_memory(p);
-    }
-  };
-
-  const auto pin_one = [&pinned](T* ptr, size_t n) {
-    cuda_try(pin_memory(ptr, n));
-    pinned.push_back(ptr);
-  };
-
-  if constexpr (rank == 0)
-  {
-    pin_one(s.data_handle(), 1);
-  }
-  else if constexpr (rank == 1)
-  {
-    pin_one(s.data_handle(), s.extent(0));
-  }
-  else if constexpr (rank == 2)
-  {
-    switch (contiguous_dims(s))
+  // Walk the Cartesian product of the trailing, non-contiguous dims [c, rank);
+  // each index tuple is the base of one contiguous hunk.
+  auto rec = [&](auto&& self, T* base, size_t dim) -> void {
+    if (dim == rank)
     {
-      case 1:
-        for (size_t index_1 = 0; index_1 < s.extent(1); index_1++)
-        {
-          pin_one(&s(0, index_1) + index_1 * s.stride(1), s.extent(0));
-        }
-        break;
-      case 2:
-        // fprintf(stderr, "PIN 2D - contiguous\n");
-        pin_one(s.data_handle(), s.extent(0) * s.extent(1));
-        break;
-      default:
-        assert(false);
-        abort();
+      f(base, hunk);
+      return;
     }
-  }
-  else
-  {
-    static_assert(rank == 3, "Dimensionality not supported.");
-    switch (contiguous_dims(s))
+    for (size_t i = 0; i < s.extent(dim); ++i)
     {
-      case 1:
-        for (size_t index_2 = 0; index_2 < s.extent(2); index_2++)
-        {
-          for (size_t index_1 = 0; index_1 < s.extent(1); index_1++)
-          {
-            // fprintf(stderr, "ADDR %d,%d,0 = %p \n", index_2, index_1, &s(index_2, index_1, 0));
-            pin_one(&s(0, index_1, index_2), s.extent(0));
-          }
-        }
-        break;
-      case 2:
-        for (size_t index_2 = 0; index_2 < s.extent(2); index_2++)
-        {
-          pin_one(&s(0, 0, index_2), s.extent(0) * s.extent(1));
-        }
-        break;
-      case 3:
-        // fprintf(stderr, "PIN 3D - contiguous\n");
-        pin_one(s.data_handle(), s.extent(0) * s.extent(1) * s.extent(2));
-        break;
-      default:
-        assert(false);
-        abort();
+      self(self, base + i * s.stride(dim), dim + 1);
     }
-  }
-
-  return true;
+  };
+  rec(rec, s.data_handle(), c);
 }
+} // namespace reserved
 
 /**
  * @brief Unpin the memory associated with an mdspan object.
@@ -907,70 +851,48 @@ bool pin(mdspan<T, P...>& s)
  * @tparam T The type of elements in the mdspan.
  * @tparam P The properties of the mdspan.
  * @param s The mdspan object to unpin memory for.
+ *
+ * `unpin_memory` silently ignores regions that are not currently registered, so
+ * this is safe on fully- *or* partially-pinned slices (e.g. a `pin()` that
+ * failed partway and rolled back, or an already-unpinned slice).
  */
 template <typename T, typename... P>
 void unpin(mdspan<T, P...>& s)
 {
-  // We need the rank as a constexpr value
-  constexpr auto rank = mdspan<T, P...>::extents_type::rank();
+  reserved::for_each_contiguous_hunk(s, [](T* base, size_t /*n*/) {
+    unpin_memory(base);
+  });
+}
 
-  if constexpr (rank == 0)
-  {
-    unpin_memory(s.data_handle());
-  }
-  else if constexpr (rank == 1)
-  {
-    unpin_memory(s.data_handle());
-  }
-  else if constexpr (rank == 2)
+/**
+ * @brief Pins a slice in host memory for efficient use with CUDA primitives
+ *
+ * @tparam T memory type
+ * @tparam P slice properties
+ * @param s slice to pin
+ * @return true if the slice was newly pinned, false if it was already pinned
+ */
+template <typename T, typename... P>
+bool pin(mdspan<T, P...>& s)
+{
+  if (address_is_pinned(s.data_handle()))
   {
-    switch (contiguous_dims(s))
-    {
-      case 1:
-        for (size_t index_1 = 0; index_1 < s.extent(1); index_1++)
-        {
-          unpin_memory(&s(0, index_1) + index_1 * s.extent(0));
-        }
-        break;
-      case 2:
-        // fprintf(stderr, "PIN 2D - contiguous\n");
-        unpin_memory(s.data_handle());
-        break;
-      default:
-        assert(false);
-        abort();
-    }
+    return false;
   }
-  else
+
+  // Roll back on any failure. unpin() tolerates hunks that were never pinned
+  // (the one that threw, plus the ones we never reached), so this leaves the
+  // slice fully unpinned -- consistent with the address_is_pinned() proxy above.
+  SCOPE(fail)
   {
-    static_assert(rank == 3, "Dimensionality not supported.");
-    switch (contiguous_dims(s))
-    {
-      case 1:
-        for (size_t index_2 = 0; index_2 < s.extent(2); index_2++)
-        {
-          for (size_t index_1 = 0; index_1 < s.extent(1); index_1++)
-          {
-            // fprintf(stderr, "ADDR %d,%d,0 = %p \n", index_2, index_1, &s(index_2, index_1, 0));
-            unpin_memory(&s(0, index_1, index_2));
-          }
-        }
-        break;
-      case 2:
-        for (size_t index_2 = 0; index_2 < s.extent(2); index_2++)
-        {
-          unpin_memory(&s(0, 0, index_2));
-        }
-        break;
-      case 3:
-        // fprintf(stderr, "PIN 3D - contiguous\n");
-        unpin_memory(s.data_handle());
-        break;
-      default:
-        assert(false);
-        abort();
-    }
-  }
+    unpin(s);
+  };
+
+  reserved::for_each_contiguous_hunk(s, [](T* base, size_t n) {
+    cuda_try(pin_memory(base, n));
+  });
+
+  return true;
 }
 
 _CCCL_DIAG_PUSH

From d37dbfe8e46d6ee66b068c98c922b9e0d8a512d0 Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Wed, 3 Jun 2026 23:30:09 -0400
Subject: [PATCH 5/7] cudax/stf: destroy test streams via SCOPE(exit) in
 context.cuh unit tests

In the two UNITTESTs that create a user stream, release it through a
SCOPE(exit) guard right after creation instead of a trailing
cudaStreamDestroy, so the stream is destroyed even if the test body
throws. cuda_safe_call is used inside the noexcept SCOPE(exit) body.
---
 .../cuda/experimental/__stf/internal/context.cuh     | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh
index 50d66b3599f..b84b5dfcbfa 100644
--- a/cudax/include/cuda/experimental/__stf/internal/context.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh
@@ -1122,6 +1122,10 @@ UNITTEST("context resources released on finalize non blocking")
   };
 
   const cudaStream_t stream = cuda_try<cudaStreamCreate>();
+  SCOPE(exit)
+  {
+    cuda_safe_call(cudaStreamDestroy(stream));
+  };
 
   bool released = false;
   context ctx(stream, async_resources_handle());
@@ -1130,8 +1134,6 @@ UNITTEST("context resources released on finalize non blocking")
   EXPECT(!released); // not yet, callback not run
   cuda_try<cudaStreamSynchronize>(stream);
   EXPECT(released);
-
-  cuda_try<cudaStreamDestroy>(stream);
 };
 
 UNITTEST("context import_resources_from")
@@ -1182,6 +1184,10 @@ UNITTEST("context graph and stage")
 UNITTEST("context with arguments")
 {
   const cudaStream_t stream = cuda_try<cudaStreamCreate>();
+  SCOPE(exit)
+  {
+    cuda_safe_call(cudaStreamDestroy(stream));
+  };
 
   async_resources_handle h;
 
@@ -1196,8 +1202,6 @@ UNITTEST("context with arguments")
 
   context ctx4 = graph_ctx(stream, h);
   ctx4.finalize();
-
-  cuda_try<cudaStreamDestroy>(stream);
 };
 
 #  if !defined(CUDASTF_DISABLE_CODE_GENERATION) && _CCCL_CUDA_COMPILATION()

From 7089769922c32e1ad3e0ddfbfb442464561e57ad Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Thu, 4 Jun 2026 00:58:56 -0400
Subject: [PATCH 6/7] cudax/stf: check the cudaMemcpyAsync in a context unit
 test

The cudaMemcpyAsync in this UNITTEST task body was unchecked; wrap it in
cuda_safe_call so a failure is reported. (cuda_safe_call, not cuda_try:
this runs inside the task-body callback where throwing is not safe.)
---
 cudax/include/cuda/experimental/__stf/internal/context.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh
index b84b5dfcbfa..3e7468618fc 100644
--- a/cudax/include/cuda/experimental/__stf/internal/context.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh
@@ -1566,7 +1566,7 @@ UNITTEST("context task")
 
   ctx.task(la.read(), lb.write())->*[](auto s, auto a, auto b) {
     // no-op
-    cudaMemcpyAsync(&b(0), &a(0), sizeof(int), cudaMemcpyDeviceToDevice, s);
+    cuda_safe_call(cudaMemcpyAsync(&b(0), &a(0), sizeof(int), cudaMemcpyDeviceToDevice, s));
   };
 
   ctx.finalize();

From 2d42937940faa290cfd2a16976b16d64a042246f Mon Sep 17 00:00:00 2001
From: Andrei Alexandrescu <andrei@erdani.com>
Date: Thu, 4 Jun 2026 18:22:31 -0400
Subject: [PATCH 7/7] cudax/stf: include scope_guard.cuh in ctx_resource.cuh

ctx_resource.cuh uses SCOPE(exit) in the resource-release callback but only
included core.cuh and cuda_safe_call.cuh, so SCOPE was undefined. Because
ctx_resource.cuh is widely included, this broke the cudax build across the
matrix. Add the missing scope_guard.cuh include.
---
 cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh
index 7f8ad21c4c7..79eaa4a804f 100644
--- a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh
+++ b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh
@@ -17,6 +17,7 @@
 
 #include <cuda/experimental/__stf/utility/core.cuh>
 #include <cuda/experimental/__stf/utility/cuda_safe_call.cuh>
+#include <cuda/experimental/__stf/utility/scope_guard.cuh>
 
 #include <functional>
 #include <memory>