Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ public:

// This is movable, but we don't need to call clear anymore after moving
stream_adapter(stream_adapter&& other) noexcept
: adapter_state(other.adapter_state)
, alloc(other.alloc)
: adapter_state(mv(other.adapter_state))
, alloc(mv(other.alloc))
, cleared_or_moved(other.cleared_or_moved)
{
// No need to clear this now that it was moved
Expand Down
42 changes: 23 additions & 19 deletions cudax/include/cuda/experimental/__stf/internal/algorithm.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,7 @@ public:
graph_cache[hashValue] = inner_graph;
}

cudaGraphNode_t c;
cuda_safe_call(cudaGraphAddChildGraphNode(&c, graph, nullptr, 0, *inner_graph));
::std::ignore = cuda_try<cudaGraphAddChildGraphNode>(graph, nullptr, 0, *inner_graph);
}

/* This simply executes the algorithm within the existing context. This
Expand Down Expand Up @@ -250,6 +249,12 @@ public:
// instead. These resources need to be released later with .clear()
auto adapter = setup_allocator(gctx, stream);

// Speaking of which.
SCOPE(exit)
{
adapter.clear();
};

auto current_data_place = gctx.default_exec_place().affine_data_place();

// Call fun with all arguments transformed to logical data
Expand All @@ -274,21 +279,18 @@ public:

if (!eg)
{
eg = ::std::shared_ptr<cudaGraphExec_t>(new cudaGraphExec_t, [](cudaGraphExec_t* p) {
cudaGraphExecDestroy(*p);
});
eg = {new cudaGraphExec_t{}, [](cudaGraphExec_t* p) {
cudaGraphExecDestroy(*p);
}};

dump_algorithm(gctx_graph);

cuda_try(cudaGraphInstantiateWithFlags(eg.get(), *gctx_graph, 0));
*eg = cuda_try<cudaGraphInstantiateWithFlags>(*gctx_graph, 0);

cached_exec_graphs[stream].push_back(eg);
}

cuda_safe_call(cudaGraphLaunch(*eg, stream));

// Free resources allocated through the adapter
adapter.clear();
cuda_try<cudaGraphLaunch>(*eg, stream);
}

/* Contrary to `run`, we here have a dynamic set of dependencies for the
Expand All @@ -307,6 +309,12 @@ public:
// instead. These resources need to be released later with .clear()
auto adapter = setup_allocator(gctx, stream);

// Speaking of which.
SCOPE(exit)
{
adapter.clear();
};

auto current_place = gctx.default_exec_place();

::std::forward<Fun>(fun)(gctx, t);
Expand All @@ -327,22 +335,18 @@ public:

if (!eg)
{
auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) {
cudaGraphExecDestroy(*pGraphExec);
};
eg = ::std::shared_ptr<cudaGraphExec_t>(new cudaGraphExec_t, cudaGraphExecDeleter);
eg = {new cudaGraphExec_t{}, [](cudaGraphExec_t* p) {
cudaGraphExecDestroy(*p);
}};

dump_algorithm(gctx_graph);

cuda_try(cudaGraphInstantiateWithFlags(eg.get(), *gctx_graph, 0));
*eg = cuda_try<cudaGraphInstantiateWithFlags>(*gctx_graph, 0);

cached_exec_graphs[stream].push_back(eg);
}

cuda_safe_call(cudaGraphLaunch(*eg, stream));

// Free resources allocated through the adapter
adapter.clear();
cuda_try<cudaGraphLaunch>(*eg, stream);
}

private:
Expand Down
3 changes: 1 addition & 2 deletions cudax/include/cuda/experimental/__stf/internal/dot.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -560,8 +560,7 @@ public:
{
if (getenv("CUDASTF_DOT_COLOR_BY_DEVICE"))
{
int dev;
cuda_safe_call(cudaGetDevice(&dev));
const int dev = cuda_try<cudaGetDevice>();
EXPECT(dev < sizeof(colors) / sizeof(*colors));
current_color = colors[dev];
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,11 @@ inline bool try_updating_executable_graph(cudaGraphExec_t exec_graph, cudaGraph_
// Instantiate a CUDA graph
inline ::std::shared_ptr<cudaGraphExec_t> graph_instantiate(cudaGraph_t g)
{
// Custom deleter specifically for cudaGraphExec_t
auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) {
cudaGraphExecDestroy(*pGraphExec);
};

::std::shared_ptr<cudaGraphExec_t> res(new cudaGraphExec_t, cudaGraphExecDeleter);
::std::shared_ptr<cudaGraphExec_t> res{new cudaGraphExec_t{}, [](cudaGraphExec_t* p) {
cudaGraphExecDestroy(*p);
}};

cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0));
*res = cuda_try<cudaGraphInstantiateWithFlags>(g, 0);

return res;
}
Expand Down Expand Up @@ -100,8 +97,7 @@ public:
cache_size_limit = atol(str) * 1024 * 1024;
}

int ndevices;
cuda_safe_call(cudaGetDeviceCount(&ndevices));
const int ndevices = cuda_try<cudaGetDeviceCount>();

// One individual cache per device (TODO per execution place at some point
// if we consider green contexts or multi-gpu graphs ?)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ public:

size_t sz = sizeof(T);

cuda_safe_call(cudaMemcpyAsync((void*) dst_instance.addr, (void*) src_instance.addr, sz, kind, stream));
cuda_try(cudaMemcpyAsync((void*) dst_instance.addr, (void*) src_instance.addr, sz, kind, stream));
}

void data_allocate(
Expand Down Expand Up @@ -267,9 +267,7 @@ public:
.extent = make_cudaExtent(sizeof(T), 1, 1),
.kind = kind};

cudaGraphNode_t result;
cuda_safe_call(cudaGraphAddMemcpyNode(&result, graph, input_nodes, input_cnt, &cpy_params));
return result;
return cuda_try<cudaGraphAddMemcpyNode>(graph, input_nodes, input_cnt, &cpy_params);
}

bool pin_host_memory(instance_id_t instance_id) override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ class scheduler
{
public:
scheduler()
: num_devices(cuda_try<cudaGetDeviceCount>())
{
cuda_safe_call(cudaGetDeviceCount(&num_devices));
assert(num_devices > 0);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ UNITTEST("thread hierarchy indexing")
auto config = p.get_config();
reserved::unit_test_thread_hierarchy<<<config[1], config[2]>>>(h);

cuda_safe_call(cudaDeviceSynchronize());
cuda_try(cudaDeviceSynchronize());
};

namespace reserved
Expand Down Expand Up @@ -473,15 +473,15 @@ UNITTEST("thread hierarchy sync")
auto config = p.get_config();

void* args[] = {&h};
cuda_safe_call(cudaLaunchCooperativeKernel(
cuda_try(cudaLaunchCooperativeKernel(
(void*) reserved::unit_test_thread_hierarchy_sync<true, size_t(0), true, size_t(1)>,
config[1],
config[2],
args,
0,
0));

cuda_safe_call(cudaDeviceSynchronize());
cuda_try(cudaDeviceSynchronize());
};

namespace reserved
Expand Down Expand Up @@ -511,7 +511,7 @@ UNITTEST("thread hierarchy inner sync")
auto config = p.get_config();
reserved::unit_test_thread_hierarchy_inner_sync<false, size_t(0), true, size_t(0)><<<config[1], config[2]>>>(h);

cuda_safe_call(cudaDeviceSynchronize());
cuda_try(cudaDeviceSynchronize());
};

# endif // !defined(CUDASTF_DISABLE_CODE_GENERATION) && _CCCL_CUDA_COMPILATION()
Expand Down