NVIDIA · andralex · Jun 8, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 4, 2026
@@ -131,8 +131,8 @@ public:
 
   // This is movable, but we don't need to call clear anymore after moving
   stream_adapter(stream_adapter&& other) noexcept
-      : adapter_state(other.adapter_state)
-      , alloc(other.alloc)
+      : adapter_state(mv(other.adapter_state))
+      , alloc(mv(other.alloc))
       , cleared_or_moved(other.cleared_or_moved)
   {
     // No need to clear this now that it was moved

@@ -103,8 +103,7 @@ public:
       graph_cache[hashValue] = inner_graph;
     }
 
-    cudaGraphNode_t c;
-    cuda_safe_call(cudaGraphAddChildGraphNode(&c, graph, nullptr, 0, *inner_graph));
+    ::std::ignore = cuda_try<cudaGraphAddChildGraphNode>(graph, nullptr, 0, *inner_graph);
   }
 
   /* This simply executes the algorithm within the existing context. This
@@ -250,6 +249,12 @@ public:
     // instead. These resources need to be released later with .clear()
     auto adapter = setup_allocator(gctx, stream);
 
+    // Speaking of which.
+    SCOPE(exit)
+    {
+      adapter.clear();
+    };
+
     auto current_data_place = gctx.default_exec_place().affine_data_place();
 
     // Call fun with all arguments transformed to logical data
@@ -274,21 +279,18 @@ public:
 
     if (!eg)
     {
-      eg = ::std::shared_ptr<cudaGraphExec_t>(new cudaGraphExec_t, [](cudaGraphExec_t* p) {
-        cudaGraphExecDestroy(*p);
-      });
+      eg = {new cudaGraphExec_t{}, [](cudaGraphExec_t* p) {
+              cuda_safe_call(cudaGraphExecDestroy(*p));
+            }};
 
       dump_algorithm(gctx_graph);
 
-      cuda_try(cudaGraphInstantiateWithFlags(eg.get(), *gctx_graph, 0));
+      *eg = cuda_try<cudaGraphInstantiateWithFlags>(*gctx_graph, 0);
 
       cached_exec_graphs[stream].push_back(eg);
     }
 
-    cuda_safe_call(cudaGraphLaunch(*eg, stream));
-
-    // Free resources allocated through the adapter
-    adapter.clear();
+    cuda_try<cudaGraphLaunch>(*eg, stream);
   }
 
   /* Contrary to `run`, we here have a dynamic set of dependencies for the
@@ -307,6 +309,12 @@ public:
     // instead. These resources need to be released later with .clear()
     auto adapter = setup_allocator(gctx, stream);
 
+    // Speaking of which.
+    SCOPE(exit)
+    {
+      adapter.clear();
+    };
+
     auto current_place = gctx.default_exec_place();
 
     ::std::forward<Fun>(fun)(gctx, t);
@@ -327,22 +335,18 @@ public:
 
     if (!eg)
     {
-      auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) {
-        cudaGraphExecDestroy(*pGraphExec);
-      };
-      eg = ::std::shared_ptr<cudaGraphExec_t>(new cudaGraphExec_t, cudaGraphExecDeleter);
+      eg = {new cudaGraphExec_t{}, [](cudaGraphExec_t* p) {
+              cuda_safe_call(cudaGraphExecDestroy(*p));
+            }};
 
       dump_algorithm(gctx_graph);
 
-      cuda_try(cudaGraphInstantiateWithFlags(eg.get(), *gctx_graph, 0));
+      *eg = cuda_try<cudaGraphInstantiateWithFlags>(*gctx_graph, 0);
 
       cached_exec_graphs[stream].push_back(eg);
     }
 
-    cuda_safe_call(cudaGraphLaunch(*eg, stream));
-
-    // Free resources allocated through the adapter
-    adapter.clear();
+    cuda_try<cudaGraphLaunch>(*eg, stream);
   }
 
 private:
@@ -353,7 +357,7 @@ private:
     {
       static int print_to_dot_cnt = 0; // Warning: not thread-safe
       ::std::string filename      = "algo_" + symbol + "_" + ::std::to_string(print_to_dot_cnt++) + ".dot";
-      cudaGraphDebugDotPrint(*gctx_graph, filename.c_str(), cudaGraphDebugDotFlags(0));
+      cuda_safe_call(cudaGraphDebugDotPrint(*gctx_graph, filename.c_str(), cudaGraphDebugDotFlags(0)));
     }
   }
 

@@ -560,8 +560,7 @@ public:
   {
     if (getenv("CUDASTF_DOT_COLOR_BY_DEVICE"))
     {
-      int dev;
-      cuda_safe_call(cudaGetDevice(&dev));
+      const int dev = cuda_try<cudaGetDevice>();
       EXPECT(dev < sizeof(colors) / sizeof(*colors));
       current_color = colors[dev];
     }

@@ -53,14 +53,11 @@ inline bool try_updating_executable_graph(cudaGraphExec_t exec_graph, cudaGraph_
 // Instantiate a CUDA graph
 inline ::std::shared_ptr<cudaGraphExec_t> graph_instantiate(cudaGraph_t g)
 {
-  // Custom deleter specifically for cudaGraphExec_t
-  auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) {
-    cudaGraphExecDestroy(*pGraphExec);
-  };
-
-  ::std::shared_ptr<cudaGraphExec_t> res(new cudaGraphExec_t, cudaGraphExecDeleter);
+  ::std::shared_ptr<cudaGraphExec_t> res{new cudaGraphExec_t{}, [](cudaGraphExec_t* p) {
+                                           cuda_safe_call(cudaGraphExecDestroy(*p));
+                                         }};
 
-  cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0));
+  *res = cuda_try<cudaGraphInstantiateWithFlags>(g, 0);
 
   return res;
 }
@@ -100,8 +97,7 @@ public:
       cache_size_limit = atol(str) * 1024 * 1024;
     }
 
-    int ndevices;
-    cuda_safe_call(cudaGetDeviceCount(&ndevices));
+    const int ndevices = cuda_try<cudaGetDeviceCount>();
 
     // One individual cache per device (TODO per execution place at some point
     // if we consider green contexts or multi-gpu graphs ?)

@@ -131,7 +131,7 @@ public:
 
     size_t sz = sizeof(T);
 
-    cuda_safe_call(cudaMemcpyAsync((void*) dst_instance.addr, (void*) src_instance.addr, sz, kind, stream));
+    cuda_try(cudaMemcpyAsync((void*) dst_instance.addr, (void*) src_instance.addr, sz, kind, stream));
   }
 
   void data_allocate(
@@ -267,9 +267,7 @@ public:
       .extent   = make_cudaExtent(sizeof(T), 1, 1),
       .kind     = kind};
 
-    cudaGraphNode_t result;
-    cuda_safe_call(cudaGraphAddMemcpyNode(&result, graph, input_nodes, input_cnt, &cpy_params));
-    return result;
+    return cuda_try<cudaGraphAddMemcpyNode>(graph, input_nodes, input_cnt, &cpy_params);
   }
 
   bool pin_host_memory(instance_id_t instance_id) override

@@ -52,8 +52,8 @@ class scheduler
 {
 public:
   scheduler()
+      : num_devices(cuda_try<cudaGetDeviceCount>())
   {
-    cuda_safe_call(cudaGetDeviceCount(&num_devices));
     assert(num_devices > 0);
   }
 

@@ -442,7 +442,7 @@ UNITTEST("thread hierarchy indexing")
   auto config = p.get_config();
   reserved::unit_test_thread_hierarchy<<<config[1], config[2]>>>(h);
 
-  cuda_safe_call(cudaDeviceSynchronize());
+  cuda_try(cudaDeviceSynchronize());
 };
 
 namespace reserved
@@ -473,15 +473,15 @@ UNITTEST("thread hierarchy sync")
   auto config = p.get_config();
 
   void* args[] = {&h};
-  cuda_safe_call(cudaLaunchCooperativeKernel(
+  cuda_try(cudaLaunchCooperativeKernel(
     (void*) reserved::unit_test_thread_hierarchy_sync<true, size_t(0), true, size_t(1)>,
     config[1],
     config[2],
     args,
     0,
     0));
 
-  cuda_safe_call(cudaDeviceSynchronize());
+  cuda_try(cudaDeviceSynchronize());
 };
 
 namespace reserved
@@ -511,7 +511,7 @@ UNITTEST("thread hierarchy inner sync")
   auto config = p.get_config();
   reserved::unit_test_thread_hierarchy_inner_sync<false, size_t(0), true, size_t(0)><<<config[1], config[2]>>>(h);
 
-  cuda_safe_call(cudaDeviceSynchronize());
+  cuda_try(cudaDeviceSynchronize());
 };
 
 #  endif // !defined(CUDASTF_DISABLE_CODE_GENERATION) && _CCCL_CUDA_COMPILATION()