diff --git a/cudax/include/cuda/experimental/__stf/allocators/adapters.cuh b/cudax/include/cuda/experimental/__stf/allocators/adapters.cuh index 2cce7bed289..f63c3aa5686 100644 --- a/cudax/include/cuda/experimental/__stf/allocators/adapters.cuh +++ b/cudax/include/cuda/experimental/__stf/allocators/adapters.cuh @@ -131,8 +131,8 @@ public: // This is movable, but we don't need to call clear anymore after moving stream_adapter(stream_adapter&& other) noexcept - : adapter_state(other.adapter_state) - , alloc(other.alloc) + : adapter_state(mv(other.adapter_state)) + , alloc(mv(other.alloc)) , cleared_or_moved(other.cleared_or_moved) { // No need to clear this now that it was moved diff --git a/cudax/include/cuda/experimental/__stf/internal/algorithm.cuh b/cudax/include/cuda/experimental/__stf/internal/algorithm.cuh index e88d8d699c9..6a81e0b73d6 100644 --- a/cudax/include/cuda/experimental/__stf/internal/algorithm.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/algorithm.cuh @@ -103,8 +103,7 @@ public: graph_cache[hashValue] = inner_graph; } - cudaGraphNode_t c; - cuda_safe_call(cudaGraphAddChildGraphNode(&c, graph, nullptr, 0, *inner_graph)); + ::std::ignore = cuda_try(graph, nullptr, 0, *inner_graph); } /* This simply executes the algorithm within the existing context. This @@ -250,6 +249,12 @@ public: // instead. These resources need to be released later with .clear() auto adapter = setup_allocator(gctx, stream); + // Speaking of which. + SCOPE(exit) + { + adapter.clear(); + }; + auto current_data_place = gctx.default_exec_place().affine_data_place(); // Call fun with all arguments transformed to logical data @@ -274,21 +279,18 @@ public: if (!eg) { - eg = ::std::shared_ptr(new cudaGraphExec_t, [](cudaGraphExec_t* p) { - cudaGraphExecDestroy(*p); - }); + eg = {new cudaGraphExec_t{}, [](cudaGraphExec_t* p) { + cuda_safe_call(cudaGraphExecDestroy(*p)); + }}; dump_algorithm(gctx_graph); - cuda_try(cudaGraphInstantiateWithFlags(eg.get(), *gctx_graph, 0)); + *eg = cuda_try(*gctx_graph, 0); cached_exec_graphs[stream].push_back(eg); } - cuda_safe_call(cudaGraphLaunch(*eg, stream)); - - // Free resources allocated through the adapter - adapter.clear(); + cuda_try(*eg, stream); } /* Contrary to `run`, we here have a dynamic set of dependencies for the @@ -307,6 +309,12 @@ public: // instead. These resources need to be released later with .clear() auto adapter = setup_allocator(gctx, stream); + // Speaking of which. + SCOPE(exit) + { + adapter.clear(); + }; + auto current_place = gctx.default_exec_place(); ::std::forward(fun)(gctx, t); @@ -327,22 +335,18 @@ public: if (!eg) { - auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) { - cudaGraphExecDestroy(*pGraphExec); - }; - eg = ::std::shared_ptr(new cudaGraphExec_t, cudaGraphExecDeleter); + eg = {new cudaGraphExec_t{}, [](cudaGraphExec_t* p) { + cuda_safe_call(cudaGraphExecDestroy(*p)); + }}; dump_algorithm(gctx_graph); - cuda_try(cudaGraphInstantiateWithFlags(eg.get(), *gctx_graph, 0)); + *eg = cuda_try(*gctx_graph, 0); cached_exec_graphs[stream].push_back(eg); } - cuda_safe_call(cudaGraphLaunch(*eg, stream)); - - // Free resources allocated through the adapter - adapter.clear(); + cuda_try(*eg, stream); } private: @@ -353,7 +357,7 @@ private: { static int print_to_dot_cnt = 0; // Warning: not thread-safe ::std::string filename = "algo_" + symbol + "_" + ::std::to_string(print_to_dot_cnt++) + ".dot"; - cudaGraphDebugDotPrint(*gctx_graph, filename.c_str(), cudaGraphDebugDotFlags(0)); + cuda_safe_call(cudaGraphDebugDotPrint(*gctx_graph, filename.c_str(), cudaGraphDebugDotFlags(0))); } } diff --git a/cudax/include/cuda/experimental/__stf/internal/dot.cuh b/cudax/include/cuda/experimental/__stf/internal/dot.cuh index fd42a8b0c49..887ef0019a5 100644 --- a/cudax/include/cuda/experimental/__stf/internal/dot.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/dot.cuh @@ -560,8 +560,7 @@ public: { if (getenv("CUDASTF_DOT_COLOR_BY_DEVICE")) { - int dev; - cuda_safe_call(cudaGetDevice(&dev)); + const int dev = cuda_try(); EXPECT(dev < sizeof(colors) / sizeof(*colors)); current_color = colors[dev]; } diff --git a/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh b/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh index 635d1bd17bd..1ff03a5f588 100644 --- a/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/executable_graph_cache.cuh @@ -53,14 +53,11 @@ inline bool try_updating_executable_graph(cudaGraphExec_t exec_graph, cudaGraph_ // Instantiate a CUDA graph inline ::std::shared_ptr graph_instantiate(cudaGraph_t g) { - // Custom deleter specifically for cudaGraphExec_t - auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) { - cudaGraphExecDestroy(*pGraphExec); - }; - - ::std::shared_ptr res(new cudaGraphExec_t, cudaGraphExecDeleter); + ::std::shared_ptr res{new cudaGraphExec_t{}, [](cudaGraphExec_t* p) { + cuda_safe_call(cudaGraphExecDestroy(*p)); + }}; - cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0)); + *res = cuda_try(g, 0); return res; } @@ -100,8 +97,7 @@ public: cache_size_limit = atol(str) * 1024 * 1024; } - int ndevices; - cuda_safe_call(cudaGetDeviceCount(&ndevices)); + const int ndevices = cuda_try(); // One individual cache per device (TODO per execution place at some point // if we consider green contexts or multi-gpu graphs ?) diff --git a/cudax/include/cuda/experimental/__stf/internal/scalar_interface.cuh b/cudax/include/cuda/experimental/__stf/internal/scalar_interface.cuh index e6a515cd380..2eba93a0aac 100644 --- a/cudax/include/cuda/experimental/__stf/internal/scalar_interface.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/scalar_interface.cuh @@ -131,7 +131,7 @@ public: size_t sz = sizeof(T); - cuda_safe_call(cudaMemcpyAsync((void*) dst_instance.addr, (void*) src_instance.addr, sz, kind, stream)); + cuda_try(cudaMemcpyAsync((void*) dst_instance.addr, (void*) src_instance.addr, sz, kind, stream)); } void data_allocate( @@ -267,9 +267,7 @@ public: .extent = make_cudaExtent(sizeof(T), 1, 1), .kind = kind}; - cudaGraphNode_t result; - cuda_safe_call(cudaGraphAddMemcpyNode(&result, graph, input_nodes, input_cnt, &cpy_params)); - return result; + return cuda_try(graph, input_nodes, input_cnt, &cpy_params); } bool pin_host_memory(instance_id_t instance_id) override diff --git a/cudax/include/cuda/experimental/__stf/internal/scheduler.cuh b/cudax/include/cuda/experimental/__stf/internal/scheduler.cuh index e6af82783ec..7e103cefc1f 100644 --- a/cudax/include/cuda/experimental/__stf/internal/scheduler.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/scheduler.cuh @@ -52,8 +52,8 @@ class scheduler { public: scheduler() + : num_devices(cuda_try()) { - cuda_safe_call(cudaGetDeviceCount(&num_devices)); assert(num_devices > 0); } diff --git a/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh b/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh index c0031c5068e..eb8e44a9cce 100644 --- a/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh +++ b/cudax/include/cuda/experimental/__stf/internal/thread_hierarchy.cuh @@ -442,7 +442,7 @@ UNITTEST("thread hierarchy indexing") auto config = p.get_config(); reserved::unit_test_thread_hierarchy<<>>(h); - cuda_safe_call(cudaDeviceSynchronize()); + cuda_try(cudaDeviceSynchronize()); }; namespace reserved @@ -473,7 +473,7 @@ UNITTEST("thread hierarchy sync") auto config = p.get_config(); void* args[] = {&h}; - cuda_safe_call(cudaLaunchCooperativeKernel( + cuda_try(cudaLaunchCooperativeKernel( (void*) reserved::unit_test_thread_hierarchy_sync, config[1], config[2], @@ -481,7 +481,7 @@ UNITTEST("thread hierarchy sync") 0, 0)); - cuda_safe_call(cudaDeviceSynchronize()); + cuda_try(cudaDeviceSynchronize()); }; namespace reserved @@ -511,7 +511,7 @@ UNITTEST("thread hierarchy inner sync") auto config = p.get_config(); reserved::unit_test_thread_hierarchy_inner_sync<<>>(h); - cuda_safe_call(cudaDeviceSynchronize()); + cuda_try(cudaDeviceSynchronize()); }; # endif // !defined(CUDASTF_DISABLE_CODE_GENERATION) && _CCCL_CUDA_COMPILATION()