NVIDIA · andralex · Jun 3, 2026 · Jun 3, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -104,7 +104,7 @@ public:
     }
 
     cudaGraphNode_t c;
-    cuda_safe_call(cudaGraphAddChildGraphNode(&c, graph, nullptr, 0, *inner_graph));
+    cuda_try(cudaGraphAddChildGraphNode(&c, graph, nullptr, 0, *inner_graph));
   }
 
   /* This simply executes the algorithm within the existing context. This
@@ -285,7 +285,7 @@ public:
       cached_exec_graphs[stream].push_back(eg);
     }
 
-    cuda_safe_call(cudaGraphLaunch(*eg, stream));
+    cuda_try(cudaGraphLaunch(*eg, stream));
 
     // Free resources allocated through the adapter
     adapter.clear();
@@ -339,7 +339,7 @@ public:
       cached_exec_graphs[stream].push_back(eg);
     }
 
-    cuda_safe_call(cudaGraphLaunch(*eg, stream));
+    cuda_try(cudaGraphLaunch(*eg, stream));
 
     // Free resources allocated through the adapter
     adapter.clear();

@@ -560,8 +560,7 @@ public:
   {
     if (getenv("CUDASTF_DOT_COLOR_BY_DEVICE"))
     {
-      int dev;
-      cuda_safe_call(cudaGetDevice(&dev));
+      const int dev = cuda_try<cudaGetDevice>();
       EXPECT(dev < sizeof(colors) / sizeof(*colors));
       current_color = colors[dev];
     }

@@ -100,8 +100,7 @@ public:
       cache_size_limit = atol(str) * 1024 * 1024;
     }
 
-    int ndevices;
-    cuda_safe_call(cudaGetDeviceCount(&ndevices));
+    const int ndevices = cuda_try<cudaGetDeviceCount>();
 
     // One individual cache per device (TODO per execution place at some point
     // if we consider green contexts or multi-gpu graphs ?)

@@ -131,7 +131,7 @@ public:
 
     size_t sz = sizeof(T);
 
-    cuda_safe_call(cudaMemcpyAsync((void*) dst_instance.addr, (void*) src_instance.addr, sz, kind, stream));
+    cuda_try(cudaMemcpyAsync((void*) dst_instance.addr, (void*) src_instance.addr, sz, kind, stream));
   }
 
   void data_allocate(
@@ -267,9 +267,7 @@ public:
       .extent   = make_cudaExtent(sizeof(T), 1, 1),
       .kind     = kind};
 
-    cudaGraphNode_t result;
-    cuda_safe_call(cudaGraphAddMemcpyNode(&result, graph, input_nodes, input_cnt, &cpy_params));
-    return result;
+    return cuda_try<cudaGraphAddMemcpyNode>(graph, input_nodes, input_cnt, &cpy_params);
   }
 
   bool pin_host_memory(instance_id_t instance_id) override

@@ -53,7 +53,7 @@ class scheduler
 public:
   scheduler()
   {
-    cuda_safe_call(cudaGetDeviceCount(&num_devices));
+    num_devices = cuda_try<cudaGetDeviceCount>();
     assert(num_devices > 0);
   }
 

@@ -442,7 +442,7 @@ UNITTEST("thread hierarchy indexing")
   auto config = p.get_config();
   reserved::unit_test_thread_hierarchy<<<config[1], config[2]>>>(h);
 
-  cuda_safe_call(cudaDeviceSynchronize());
+  cuda_try(cudaDeviceSynchronize());
 };
 
 namespace reserved
@@ -473,15 +473,15 @@ UNITTEST("thread hierarchy sync")
   auto config = p.get_config();
 
   void* args[] = {&h};
-  cuda_safe_call(cudaLaunchCooperativeKernel(
+  cuda_try(cudaLaunchCooperativeKernel(
     (void*) reserved::unit_test_thread_hierarchy_sync<true, size_t(0), true, size_t(1)>,
     config[1],
     config[2],
     args,
     0,
     0));
 
-  cuda_safe_call(cudaDeviceSynchronize());
+  cuda_try(cudaDeviceSynchronize());
 };
 
 namespace reserved
@@ -511,7 +511,7 @@ UNITTEST("thread hierarchy inner sync")
   auto config = p.get_config();
   reserved::unit_test_thread_hierarchy_inner_sync<false, size_t(0), true, size_t(0)><<<config[1], config[2]>>>(h);
 
-  cuda_safe_call(cudaDeviceSynchronize());
+  cuda_try(cudaDeviceSynchronize());
 };
 
 #  endif // !defined(CUDASTF_DISABLE_CODE_GENERATION) && _CCCL_CUDA_COMPILATION()