From 74be4427a9fd532c02f0630da6eff8125652e626 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Wed, 3 Jun 2026 20:55:53 -0400 Subject: [PATCH 1/7] cudax/stf: migrate internal/ context + resources from cuda_safe_call to cuda_try Second slice of the internal/ migration, covering the stream-lifetime / resource-handle files. Two sites get real transactional handling: - ctx_resource.cuh: hold the batched callback resource list in a unique_ptr and only release() it after cudaStreamAddCallback succeeds, so a throw from the enqueue no longer leaks the list. The enqueue uses the templated cuda_try form. - slice.cuh: pin() is all-or-nothing, but the 2D/3D paths pin several regions in a loop. Record each pinned base pointer and roll them back via SCOPE(fail) if a later pin_memory throws, leaving the slice unpinned (matching the address_is_pinned early-return that treats the base address as a proxy for the whole slice). pin_memory stays in the runtime-status form (it is an overload-set template); unpin_memory only aborts, never throws, so it is safe inside the noexcept SCOPE(fail). Adds and scope_guard.cuh includes. The remaining conversions are mechanical: - async_resources_handle.cuh: cudaGraphGetNodes / cudaGraphGetEdges keep the runtime-status form (multiple output pointers, one passed as nullptr). - context.cuh: UNITTEST bodies switch to cuda_try, using the templated cuda_try form where applicable (cudaStreamCreate, cudaSetDevice, cudaStreamSynchronize/Destroy) and ::std::ignore for the discarded cudaGraphAddEmptyNode handle. --- .../__stf/internal/async_resources_handle.cuh | 6 ++-- .../experimental/__stf/internal/context.cuh | 30 +++++++--------- .../__stf/internal/ctx_resource.cuh | 9 +++-- .../experimental/__stf/internal/slice.cuh | 34 +++++++++++++++---- 4 files changed, 48 insertions(+), 31 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh index 4cf56af9dd1..767a7d3ce95 100644 --- a/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh @@ -235,11 +235,11 @@ public: size_t nedges; size_t nnodes; - cuda_safe_call(cudaGraphGetNodes(g, nullptr, &nnodes)); + cuda_try(cudaGraphGetNodes(g, nullptr, &nnodes)); #if _CCCL_CTK_AT_LEAST(13, 0) - cuda_safe_call(cudaGraphGetEdges(g, nullptr, nullptr, nullptr, &nedges)); + cuda_try(cudaGraphGetEdges(g, nullptr, nullptr, nullptr, &nedges)); #else // _CCCL_CTK_AT_LEAST(13, 0) - cuda_safe_call(cudaGraphGetEdges(g, nullptr, nullptr, &nedges)); + cuda_try(cudaGraphGetEdges(g, nullptr, nullptr, &nedges)); #endif // _CCCL_CTK_AT_LEAST(13, 0) _CCCL_ASSERT(pimpl, "async_resources_handle is not initialized"); diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index f99c0115c33..50d66b3599f 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -1121,18 +1121,17 @@ UNITTEST("context resources released on finalize non blocking") } }; - cudaStream_t stream; - cuda_safe_call(cudaStreamCreate(&stream)); + const cudaStream_t stream = cuda_try(); bool released = false; context ctx(stream, async_resources_handle()); ctx.add_resource(::std::make_shared(&released)); ctx.finalize(); // non-blocking: context was created with user stream EXPECT(!released); // not yet, callback not run - cuda_safe_call(cudaStreamSynchronize(stream)); + cuda_try(stream); EXPECT(released); - cuda_safe_call(cudaStreamDestroy(stream)); + cuda_try(stream); }; UNITTEST("context import_resources_from") @@ -1182,8 +1181,7 @@ UNITTEST("context graph and stage") UNITTEST("context with arguments") { - cudaStream_t stream; - cuda_safe_call(cudaStreamCreate(&stream)); + const cudaStream_t stream = cuda_try(); async_resources_handle h; @@ -1199,7 +1197,7 @@ UNITTEST("context with arguments") context ctx4 = graph_ctx(stream, h); ctx4.finalize(); - cuda_safe_call(cudaStreamDestroy(stream)); + cuda_try(stream); }; # if !defined(CUDASTF_DISABLE_CODE_GENERATION) && _CCCL_CUDA_COMPILATION() @@ -1706,8 +1704,7 @@ UNITTEST("make_tuple_indexwise") UNITTEST("cuda stream place") { - cudaStream_t user_stream; - cuda_safe_call(cudaStreamCreate(&user_stream)); + const cudaStream_t user_stream = cuda_try(); context ctx; @@ -1726,16 +1723,14 @@ UNITTEST("cuda stream place") UNITTEST("cuda stream place multi-gpu") { - cudaStream_t user_stream; - // Create a CUDA stream in a different device (if available) - int ndevices = cuda_try(); + const int ndevices = cuda_try(); // use the last device - int target_dev_id = ndevices - 1; + const int target_dev_id = ndevices - 1; - cuda_safe_call(cudaSetDevice(target_dev_id)); - cuda_safe_call(cudaStreamCreate(&user_stream)); - cuda_safe_call(cudaSetDevice(0)); + cuda_try(target_dev_id); + const cudaStream_t user_stream = cuda_try(); + cuda_try(0); context ctx; @@ -1826,8 +1821,7 @@ UNITTEST("get_stream graph") cudaStream_t s = t.get_stream(); // We are not capturing so there is no stream associated EXPECT(s == nullptr); - cudaGraphNode_t n; - cuda_safe_call(cudaGraphAddEmptyNode(&n, t.get_graph(), nullptr, 0)); + ::std::ignore = cuda_try(t.get_graph(), nullptr, 0); t.end(); auto t2 = ctx.task(token.rw()); diff --git a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh index d0cd3831b24..03229face07 100644 --- a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh @@ -106,8 +106,10 @@ public: // Batch all callback resources into a single host callback for efficiency if (!callback_resources.empty()) { - // Transfer ownership of callback resources to the callback - auto* callback_list = new ::std::vector<::std::shared_ptr>(mv(callback_resources)); + // Transfer ownership of callback resources to the callback. Held in a + // unique_ptr until the callback is successfully enqueued so a throw from + // cudaStreamAddCallback does not leak the list. + auto callback_list = ::std::make_unique<::std::vector<::std::shared_ptr>>(mv(callback_resources)); // Add a single host callback using lambda that will release all callback resources auto release_lambda = [](cudaStream_t /*stream*/, cudaError_t /*status*/, void* userData) -> void { @@ -123,7 +125,8 @@ public: delete resources; }; - cuda_safe_call(cudaStreamAddCallback(stream, release_lambda, callback_list, 0)); + cuda_try(stream, release_lambda, callback_list.get(), 0); + callback_list.release(); } // Mark as released to prevent double release diff --git a/cudax/include/cuda/experimental/__stf/internal/slice.cuh b/cudax/include/cuda/experimental/__stf/internal/slice.cuh index ffa39b678f4..bc61edafbfd 100644 --- a/cudax/include/cuda/experimental/__stf/internal/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/slice.cuh @@ -31,8 +31,10 @@ #include #include #include +#include #include +#include namespace cuda::experimental::stf { @@ -820,13 +822,31 @@ bool pin(mdspan& s) return false; } + // pin() is all-or-nothing: if any region fails to pin, roll back the ones we + // already pinned so the slice is left unpinned (matching the early-return + // above, which treats the base address as a proxy for the whole slice). + // unpin_memory never throws (it aborts at most), so it is safe in SCOPE(fail). + ::std::vector pinned; + SCOPE(fail) + { + for (T* p : pinned) + { + unpin_memory(p); + } + }; + + const auto pin_one = [&pinned](T* ptr, size_t n) { + cuda_try(pin_memory(ptr, n)); + pinned.push_back(ptr); + }; + if constexpr (rank == 0) { - cuda_safe_call(pin_memory(s.data_handle(), 1)); + pin_one(s.data_handle(), 1); } else if constexpr (rank == 1) { - cuda_safe_call(pin_memory(s.data_handle(), s.extent(0))); + pin_one(s.data_handle(), s.extent(0)); } else if constexpr (rank == 2) { @@ -835,12 +855,12 @@ bool pin(mdspan& s) case 1: for (size_t index_1 = 0; index_1 < s.extent(1); index_1++) { - cuda_safe_call(pin_memory(&s(0, index_1) + index_1 * s.stride(1), s.extent(0))); + pin_one(&s(0, index_1) + index_1 * s.stride(1), s.extent(0)); } break; case 2: // fprintf(stderr, "PIN 2D - contiguous\n"); - cuda_safe_call(pin_memory(s.data_handle(), s.extent(0) * s.extent(1))); + pin_one(s.data_handle(), s.extent(0) * s.extent(1)); break; default: assert(false); @@ -858,19 +878,19 @@ bool pin(mdspan& s) for (size_t index_1 = 0; index_1 < s.extent(1); index_1++) { // fprintf(stderr, "ADDR %d,%d,0 = %p \n", index_2, index_1, &s(index_2, index_1, 0)); - cuda_safe_call(pin_memory(&s(0, index_1, index_2), s.extent(0))); + pin_one(&s(0, index_1, index_2), s.extent(0)); } } break; case 2: for (size_t index_2 = 0; index_2 < s.extent(2); index_2++) { - cuda_safe_call(pin_memory(&s(0, 0, index_2), s.extent(0) * s.extent(1))); + pin_one(&s(0, 0, index_2), s.extent(0) * s.extent(1)); } break; case 3: // fprintf(stderr, "PIN 3D - contiguous\n"); - cuda_safe_call(pin_memory(s.data_handle(), s.extent(0) * s.extent(1) * s.extent(2))); + pin_one(s.data_handle(), s.extent(0) * s.extent(1) * s.extent(2)); break; default: assert(false); From 50b86a1d3ed824a03a7d336b25501be954a564b2 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Wed, 3 Jun 2026 21:04:59 -0400 Subject: [PATCH 2/7] cudax/stf: use templated cuda_try<> last-output form for graph node/edge counts cudaGraphGetNodes / cudaGraphGetEdges have multiple output pointers, but passing nullptr for the ones we don't need leaves a single synthesized output (numNodes / numEdges) as the last parameter, so cuda_try's last-output form applies: const size_t nnodes = cuda_try(g, nullptr); const size_t nedges = cuda_try(g, nullptr, nullptr[, nullptr]); This also lets the counts be const and drops the separate declarations. --- .../__stf/internal/async_resources_handle.cuh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh index 767a7d3ce95..54ac94bf40e 100644 --- a/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/async_resources_handle.cuh @@ -232,14 +232,11 @@ public: ::cuda::std::pair<::std::shared_ptr, bool> cached_graphs_query(cudaGraph_t g) { - size_t nedges; - size_t nnodes; - - cuda_try(cudaGraphGetNodes(g, nullptr, &nnodes)); + const size_t nnodes = cuda_try(g, nullptr); #if _CCCL_CTK_AT_LEAST(13, 0) - cuda_try(cudaGraphGetEdges(g, nullptr, nullptr, nullptr, &nedges)); + const size_t nedges = cuda_try(g, nullptr, nullptr, nullptr); #else // _CCCL_CTK_AT_LEAST(13, 0) - cuda_try(cudaGraphGetEdges(g, nullptr, nullptr, &nedges)); + const size_t nedges = cuda_try(g, nullptr, nullptr); #endif // _CCCL_CTK_AT_LEAST(13, 0) _CCCL_ASSERT(pimpl, "async_resources_handle is not initialized"); From cfbd028c2466ef99c4d2501ee30264d01a5d4d36 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Wed, 3 Jun 2026 21:57:48 -0400 Subject: [PATCH 3/7] cudax/stf: make ctx_resource_set::release stream-resource removal retryable Replace the null-then-erase pass with an in-place swap-from-back compaction: each stream-dependent resource is released and then removed by moving the last element into its slot and popping. A resource leaves `resources` only after release(stream) succeeds, so if it throws, the vector still holds the failing resource plus everything not yet processed -- release() can be retried with nothing lost and no double-release. No null slots linger, so no separate erase pass (and no ) is needed. --- .../__stf/internal/ctx_resource.cuh | 37 ++++++++++--------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh index 03229face07..7f8ad21c4c7 100644 --- a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh @@ -87,42 +87,43 @@ public: { _CCCL_ASSERT(!resources_released, "Resources have already been released on this context"); - // Separate resources into stream-dependent and callback-batched - decltype(resources) callback_resources; - - for (auto& r : resources) + // Release stream-dependent resources and compact them out of `resources` by + // pulling the last element into each vacated slot. A resource leaves + // `resources` only after it has been released, so if release(stream) throws, + // `resources` still holds the failing resource plus everything not yet + // processed -- release() can be retried with nothing lost or double-released. + for (size_t i = 0; i < resources.size();) { - if (r->can_release_in_callback()) - { - callback_resources.push_back(mv(r)); - } - else + if (resources[i]->can_release_in_callback()) { - r->release(stream); + ++i; + continue; } + resources[i]->release(stream); // may throw -> resources[i] stays in place + resources[i] = mv(resources.back()); + resources.pop_back(); } - resources.clear(); - // Batch all callback resources into a single host callback for efficiency - if (!callback_resources.empty()) + if (!resources.empty()) { // Transfer ownership of callback resources to the callback. Held in a // unique_ptr until the callback is successfully enqueued so a throw from // cudaStreamAddCallback does not leak the list. - auto callback_list = ::std::make_unique<::std::vector<::std::shared_ptr>>(mv(callback_resources)); + auto callback_list = ::std::make_unique<::std::vector<::std::shared_ptr>>(mv(resources)); // Add a single host callback using lambda that will release all callback resources auto release_lambda = [](cudaStream_t /*stream*/, cudaError_t /*status*/, void* userData) -> void { - auto* resources = static_cast<::std::vector<::std::shared_ptr>*>(userData); + auto* resources = static_cast(userData); + SCOPE(exit) + { + delete resources; + }; // Release all callback resources for (auto& resource : *resources) { resource->release_in_callback(); } - - // Clean up the callback list itself - delete resources; }; cuda_try(stream, release_lambda, callback_list.get(), 0); From 1c6cd4c4b298cd62ccf978e57a6ccca32293161c Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Wed, 3 Jun 2026 23:27:55 -0400 Subject: [PATCH 4/7] cudax/stf: simplify slice pin/unpin via a single contiguous-hunk walk Replace the rank-0/1/2/3 x contiguous_dims switch matrix in both pin() and unpin() with one recursive helper, reserved::for_each_contiguous_hunk, that invokes f(base, n) for each maximal contiguous hunk: the leading contiguous_dims() dims form one run, the trailing dims are enumerated recursively. This covers every element exactly once, handles any rank (drops the old rank <= 3 limit), and normalizes the odd address math in the previous 2D paths (which double-counted the stride/extent). - unpin() is now just a walk calling unpin_memory on each hunk base. unpin_memory ignores not-registered regions, so this is safe on fully- or partially-pinned slices. - pin()'s rollback is simply SCOPE(fail) { unpin(s); } -- no separate `pinned` vector, and no include. On a mid-walk failure unpin() releases the hunks we pinned and no-ops the rest, leaving the slice fully unpinned, consistent with the address_is_pinned() proxy. pin_memory stays in the runtime-status cuda_try(pin_memory(...)) form (it is an overload-set template, so cuda_try cannot name it). The helper is C++17 (recursive generic lambda) and lives in `reserved`. --- .../experimental/__stf/internal/slice.cuh | 200 ++++++------------ 1 file changed, 61 insertions(+), 139 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/slice.cuh b/cudax/include/cuda/experimental/__stf/internal/slice.cuh index bc61edafbfd..af97157da32 100644 --- a/cudax/include/cuda/experimental/__stf/internal/slice.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/slice.cuh @@ -34,7 +34,6 @@ #include #include -#include namespace cuda::experimental::stf { @@ -805,101 +804,46 @@ UNITTEST("3D slice should be similar to 3D mdspan", (slice())) #endif // UNITTESTED_FILE /** - * @brief Pins a slice in host memory for efficient use with CUDA primitives + * @brief Invokes `f(base, n)` once for each maximal contiguous hunk of `s`. * - * @tparam T memory type - * @tparam dimensions slice dimension - * @param s slice to pin + * `base` points to the first element of the hunk and `n` is the number of + * contiguous elements in it. The leading `contiguous_dims(s)` dimensions form a + * single contiguous run; the remaining dimensions are enumerated recursively, + * yielding one hunk per index tuple. Together the hunks cover every element of + * `s` exactly once, assuming the STF slice convention that dimension 0 is + * unit-stride. */ -template -bool pin(mdspan& s) +namespace reserved +{ +template +void for_each_contiguous_hunk(const mdspan& s, F&& f) { - // We need the rank as a constexpr value constexpr auto rank = mdspan::extents_type::rank(); + const size_t c = contiguous_dims(s); - if (address_is_pinned(s.data_handle())) + // The contiguous prefix [0, c) is a single run of this many elements. + size_t hunk = 1; + for (size_t d = 0; d < c; ++d) { - return false; + hunk *= s.extent(d); } - // pin() is all-or-nothing: if any region fails to pin, roll back the ones we - // already pinned so the slice is left unpinned (matching the early-return - // above, which treats the base address as a proxy for the whole slice). - // unpin_memory never throws (it aborts at most), so it is safe in SCOPE(fail). - ::std::vector pinned; - SCOPE(fail) - { - for (T* p : pinned) - { - unpin_memory(p); - } - }; - - const auto pin_one = [&pinned](T* ptr, size_t n) { - cuda_try(pin_memory(ptr, n)); - pinned.push_back(ptr); - }; - - if constexpr (rank == 0) - { - pin_one(s.data_handle(), 1); - } - else if constexpr (rank == 1) - { - pin_one(s.data_handle(), s.extent(0)); - } - else if constexpr (rank == 2) - { - switch (contiguous_dims(s)) + // Walk the Cartesian product of the trailing, non-contiguous dims [c, rank); + // each index tuple is the base of one contiguous hunk. + auto rec = [&](auto&& self, T* base, size_t dim) -> void { + if (dim == rank) { - case 1: - for (size_t index_1 = 0; index_1 < s.extent(1); index_1++) - { - pin_one(&s(0, index_1) + index_1 * s.stride(1), s.extent(0)); - } - break; - case 2: - // fprintf(stderr, "PIN 2D - contiguous\n"); - pin_one(s.data_handle(), s.extent(0) * s.extent(1)); - break; - default: - assert(false); - abort(); + f(base, hunk); + return; } - } - else - { - static_assert(rank == 3, "Dimensionality not supported."); - switch (contiguous_dims(s)) + for (size_t i = 0; i < s.extent(dim); ++i) { - case 1: - for (size_t index_2 = 0; index_2 < s.extent(2); index_2++) - { - for (size_t index_1 = 0; index_1 < s.extent(1); index_1++) - { - // fprintf(stderr, "ADDR %d,%d,0 = %p \n", index_2, index_1, &s(index_2, index_1, 0)); - pin_one(&s(0, index_1, index_2), s.extent(0)); - } - } - break; - case 2: - for (size_t index_2 = 0; index_2 < s.extent(2); index_2++) - { - pin_one(&s(0, 0, index_2), s.extent(0) * s.extent(1)); - } - break; - case 3: - // fprintf(stderr, "PIN 3D - contiguous\n"); - pin_one(s.data_handle(), s.extent(0) * s.extent(1) * s.extent(2)); - break; - default: - assert(false); - abort(); + self(self, base + i * s.stride(dim), dim + 1); } - } - - return true; + }; + rec(rec, s.data_handle(), c); } +} // namespace reserved /** * @brief Unpin the memory associated with an mdspan object. @@ -907,70 +851,48 @@ bool pin(mdspan& s) * @tparam T The type of elements in the mdspan. * @tparam P The properties of the mdspan. * @param s The mdspan object to unpin memory for. + * + * `unpin_memory` silently ignores regions that are not currently registered, so + * this is safe on fully- *or* partially-pinned slices (e.g. a `pin()` that + * failed partway and rolled back, or an already-unpinned slice). */ template void unpin(mdspan& s) { - // We need the rank as a constexpr value - constexpr auto rank = mdspan::extents_type::rank(); + reserved::for_each_contiguous_hunk(s, [](T* base, size_t /*n*/) { + unpin_memory(base); + }); +} - if constexpr (rank == 0) - { - unpin_memory(s.data_handle()); - } - else if constexpr (rank == 1) - { - unpin_memory(s.data_handle()); - } - else if constexpr (rank == 2) +/** + * @brief Pins a slice in host memory for efficient use with CUDA primitives + * + * @tparam T memory type + * @tparam P slice properties + * @param s slice to pin + * @return true if the slice was newly pinned, false if it was already pinned + */ +template +bool pin(mdspan& s) +{ + if (address_is_pinned(s.data_handle())) { - switch (contiguous_dims(s)) - { - case 1: - for (size_t index_1 = 0; index_1 < s.extent(1); index_1++) - { - unpin_memory(&s(0, index_1) + index_1 * s.extent(0)); - } - break; - case 2: - // fprintf(stderr, "PIN 2D - contiguous\n"); - unpin_memory(s.data_handle()); - break; - default: - assert(false); - abort(); - } + return false; } - else + + // Roll back on any failure. unpin() tolerates hunks that were never pinned + // (the one that threw, plus the ones we never reached), so this leaves the + // slice fully unpinned -- consistent with the address_is_pinned() proxy above. + SCOPE(fail) { - static_assert(rank == 3, "Dimensionality not supported."); - switch (contiguous_dims(s)) - { - case 1: - for (size_t index_2 = 0; index_2 < s.extent(2); index_2++) - { - for (size_t index_1 = 0; index_1 < s.extent(1); index_1++) - { - // fprintf(stderr, "ADDR %d,%d,0 = %p \n", index_2, index_1, &s(index_2, index_1, 0)); - unpin_memory(&s(0, index_1, index_2)); - } - } - break; - case 2: - for (size_t index_2 = 0; index_2 < s.extent(2); index_2++) - { - unpin_memory(&s(0, 0, index_2)); - } - break; - case 3: - // fprintf(stderr, "PIN 3D - contiguous\n"); - unpin_memory(s.data_handle()); - break; - default: - assert(false); - abort(); - } - } + unpin(s); + }; + + reserved::for_each_contiguous_hunk(s, [](T* base, size_t n) { + cuda_try(pin_memory(base, n)); + }); + + return true; } _CCCL_DIAG_PUSH From d37dbfe8e46d6ee66b068c98c922b9e0d8a512d0 Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Wed, 3 Jun 2026 23:30:09 -0400 Subject: [PATCH 5/7] cudax/stf: destroy test streams via SCOPE(exit) in context.cuh unit tests In the two UNITTESTs that create a user stream, release it through a SCOPE(exit) guard right after creation instead of a trailing cudaStreamDestroy, so the stream is destroyed even if the test body throws. cuda_safe_call is used inside the noexcept SCOPE(exit) body. --- .../cuda/experimental/__stf/internal/context.cuh | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index 50d66b3599f..b84b5dfcbfa 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -1122,6 +1122,10 @@ UNITTEST("context resources released on finalize non blocking") }; const cudaStream_t stream = cuda_try(); + SCOPE(exit) + { + cuda_safe_call(cudaStreamDestroy(stream)); + }; bool released = false; context ctx(stream, async_resources_handle()); @@ -1130,8 +1134,6 @@ UNITTEST("context resources released on finalize non blocking") EXPECT(!released); // not yet, callback not run cuda_try(stream); EXPECT(released); - - cuda_try(stream); }; UNITTEST("context import_resources_from") @@ -1182,6 +1184,10 @@ UNITTEST("context graph and stage") UNITTEST("context with arguments") { const cudaStream_t stream = cuda_try(); + SCOPE(exit) + { + cuda_safe_call(cudaStreamDestroy(stream)); + }; async_resources_handle h; @@ -1196,8 +1202,6 @@ UNITTEST("context with arguments") context ctx4 = graph_ctx(stream, h); ctx4.finalize(); - - cuda_try(stream); }; # if !defined(CUDASTF_DISABLE_CODE_GENERATION) && _CCCL_CUDA_COMPILATION() From 7089769922c32e1ad3e0ddfbfb442464561e57ad Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Thu, 4 Jun 2026 00:58:56 -0400 Subject: [PATCH 6/7] cudax/stf: check the cudaMemcpyAsync in a context unit test The cudaMemcpyAsync in this UNITTEST task body was unchecked; wrap it in cuda_safe_call so a failure is reported. (cuda_safe_call, not cuda_try: this runs inside the task-body callback where throwing is not safe.) --- cudax/include/cuda/experimental/__stf/internal/context.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cudax/include/cuda/experimental/__stf/internal/context.cuh b/cudax/include/cuda/experimental/__stf/internal/context.cuh index b84b5dfcbfa..3e7468618fc 100644 --- a/cudax/include/cuda/experimental/__stf/internal/context.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/context.cuh @@ -1566,7 +1566,7 @@ UNITTEST("context task") ctx.task(la.read(), lb.write())->*[](auto s, auto a, auto b) { // no-op - cudaMemcpyAsync(&b(0), &a(0), sizeof(int), cudaMemcpyDeviceToDevice, s); + cuda_safe_call(cudaMemcpyAsync(&b(0), &a(0), sizeof(int), cudaMemcpyDeviceToDevice, s)); }; ctx.finalize(); From 2d42937940faa290cfd2a16976b16d64a042246f Mon Sep 17 00:00:00 2001 From: Andrei Alexandrescu Date: Thu, 4 Jun 2026 18:22:31 -0400 Subject: [PATCH 7/7] cudax/stf: include scope_guard.cuh in ctx_resource.cuh ctx_resource.cuh uses SCOPE(exit) in the resource-release callback but only included core.cuh and cuda_safe_call.cuh, so SCOPE was undefined. Because ctx_resource.cuh is widely included, this broke the cudax build across the matrix. Add the missing scope_guard.cuh include. --- cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh index 7f8ad21c4c7..79eaa4a804f 100644 --- a/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/ctx_resource.cuh @@ -17,6 +17,7 @@ #include #include +#include #include #include