Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,8 @@ public:

// This is movable, but we don't need to call clear anymore after moving
stream_adapter(stream_adapter&& other) noexcept
: adapter_state(other.adapter_state)
, alloc(other.alloc)
: adapter_state(mv(other.adapter_state))
, alloc(mv(other.alloc))
, cleared_or_moved(other.cleared_or_moved)
{
// No need to clear this now that it was moved
Expand Down
44 changes: 24 additions & 20 deletions cudax/include/cuda/experimental/__stf/internal/algorithm.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,7 @@ public:
graph_cache[hashValue] = inner_graph;
}

cudaGraphNode_t c;
cuda_safe_call(cudaGraphAddChildGraphNode(&c, graph, nullptr, 0, *inner_graph));
::std::ignore = cuda_try<cudaGraphAddChildGraphNode>(graph, nullptr, 0, *inner_graph);
}

/* This simply executes the algorithm within the existing context. This
Expand Down Expand Up @@ -250,6 +249,12 @@ public:
// instead. These resources need to be released later with .clear()
auto adapter = setup_allocator(gctx, stream);

// Speaking of which.
SCOPE(exit)
{
adapter.clear();
};

auto current_data_place = gctx.default_exec_place().affine_data_place();

// Call fun with all arguments transformed to logical data
Expand All @@ -274,21 +279,18 @@ public:

if (!eg)
{
eg = ::std::shared_ptr<cudaGraphExec_t>(new cudaGraphExec_t, [](cudaGraphExec_t* p) {
cudaGraphExecDestroy(*p);
});
eg = {new cudaGraphExec_t{}, [](cudaGraphExec_t* p) {
cuda_safe_call(cudaGraphExecDestroy(*p));
}};

dump_algorithm(gctx_graph);

cuda_try(cudaGraphInstantiateWithFlags(eg.get(), *gctx_graph, 0));
*eg = cuda_try<cudaGraphInstantiateWithFlags>(*gctx_graph, 0);

cached_exec_graphs[stream].push_back(eg);
}

cuda_safe_call(cudaGraphLaunch(*eg, stream));

// Free resources allocated through the adapter
adapter.clear();
cuda_try<cudaGraphLaunch>(*eg, stream);
}

/* Contrary to `run`, we here have a dynamic set of dependencies for the
Expand All @@ -307,6 +309,12 @@ public:
// instead. These resources need to be released later with .clear()
auto adapter = setup_allocator(gctx, stream);

// Speaking of which.
SCOPE(exit)
{
adapter.clear();
};

auto current_place = gctx.default_exec_place();

::std::forward<Fun>(fun)(gctx, t);
Expand All @@ -327,22 +335,18 @@ public:

if (!eg)
{
auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) {
cudaGraphExecDestroy(*pGraphExec);
};
eg = ::std::shared_ptr<cudaGraphExec_t>(new cudaGraphExec_t, cudaGraphExecDeleter);
eg = {new cudaGraphExec_t{}, [](cudaGraphExec_t* p) {
cuda_safe_call(cudaGraphExecDestroy(*p));
}};

dump_algorithm(gctx_graph);

cuda_try(cudaGraphInstantiateWithFlags(eg.get(), *gctx_graph, 0));
*eg = cuda_try<cudaGraphInstantiateWithFlags>(*gctx_graph, 0);

cached_exec_graphs[stream].push_back(eg);
}

cuda_safe_call(cudaGraphLaunch(*eg, stream));

// Free resources allocated through the adapter
adapter.clear();
cuda_try<cudaGraphLaunch>(*eg, stream);
}

private:
Expand All @@ -353,7 +357,7 @@ private:
{
static int print_to_dot_cnt = 0; // Warning: not thread-safe
::std::string filename = "algo_" + symbol + "_" + ::std::to_string(print_to_dot_cnt++) + ".dot";
cudaGraphDebugDotPrint(*gctx_graph, filename.c_str(), cudaGraphDebugDotFlags(0));
cuda_safe_call(cudaGraphDebugDotPrint(*gctx_graph, filename.c_str(), cudaGraphDebugDotFlags(0)));
}
}

Expand Down
3 changes: 1 addition & 2 deletions cudax/include/cuda/experimental/__stf/internal/dot.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -560,8 +560,7 @@ public:
{
if (getenv("CUDASTF_DOT_COLOR_BY_DEVICE"))
{
int dev;
cuda_safe_call(cudaGetDevice(&dev));
const int dev = cuda_try<cudaGetDevice>();
EXPECT(dev < sizeof(colors) / sizeof(*colors));
current_color = colors[dev];
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,14 +53,11 @@ inline bool try_updating_executable_graph(cudaGraphExec_t exec_graph, cudaGraph_
// Instantiate a CUDA graph
inline ::std::shared_ptr<cudaGraphExec_t> graph_instantiate(cudaGraph_t g)
{
// Custom deleter specifically for cudaGraphExec_t
auto cudaGraphExecDeleter = [](cudaGraphExec_t* pGraphExec) {
cudaGraphExecDestroy(*pGraphExec);
};

::std::shared_ptr<cudaGraphExec_t> res(new cudaGraphExec_t, cudaGraphExecDeleter);
::std::shared_ptr<cudaGraphExec_t> res{new cudaGraphExec_t{}, [](cudaGraphExec_t* p) {
cuda_safe_call(cudaGraphExecDestroy(*p));
}};

cuda_try(cudaGraphInstantiateWithFlags(res.get(), g, 0));
*res = cuda_try<cudaGraphInstantiateWithFlags>(g, 0);

return res;
}
Expand Down Expand Up @@ -100,8 +97,7 @@ public:
cache_size_limit = atol(str) * 1024 * 1024;
}

int ndevices;
cuda_safe_call(cudaGetDeviceCount(&ndevices));
const int ndevices = cuda_try<cudaGetDeviceCount>();

// One individual cache per device (TODO per execution place at some point
// if we consider green contexts or multi-gpu graphs ?)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ public:

size_t sz = sizeof(T);

cuda_safe_call(cudaMemcpyAsync((void*) dst_instance.addr, (void*) src_instance.addr, sz, kind, stream));
cuda_try(cudaMemcpyAsync((void*) dst_instance.addr, (void*) src_instance.addr, sz, kind, stream));
}

void data_allocate(
Expand Down Expand Up @@ -267,9 +267,7 @@ public:
.extent = make_cudaExtent(sizeof(T), 1, 1),
.kind = kind};

cudaGraphNode_t result;
cuda_safe_call(cudaGraphAddMemcpyNode(&result, graph, input_nodes, input_cnt, &cpy_params));
return result;
return cuda_try<cudaGraphAddMemcpyNode>(graph, input_nodes, input_cnt, &cpy_params);
}

bool pin_host_memory(instance_id_t instance_id) override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,8 @@ class scheduler
{
public:
scheduler()
: num_devices(cuda_try<cudaGetDeviceCount>())
{
cuda_safe_call(cudaGetDeviceCount(&num_devices));
assert(num_devices > 0);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ UNITTEST("thread hierarchy indexing")
auto config = p.get_config();
reserved::unit_test_thread_hierarchy<<<config[1], config[2]>>>(h);

cuda_safe_call(cudaDeviceSynchronize());
cuda_try(cudaDeviceSynchronize());
};

namespace reserved
Expand Down Expand Up @@ -473,15 +473,15 @@ UNITTEST("thread hierarchy sync")
auto config = p.get_config();

void* args[] = {&h};
cuda_safe_call(cudaLaunchCooperativeKernel(
cuda_try(cudaLaunchCooperativeKernel(
(void*) reserved::unit_test_thread_hierarchy_sync<true, size_t(0), true, size_t(1)>,
config[1],
config[2],
args,
0,
0));

cuda_safe_call(cudaDeviceSynchronize());
cuda_try(cudaDeviceSynchronize());
};

namespace reserved
Expand Down Expand Up @@ -511,7 +511,7 @@ UNITTEST("thread hierarchy inner sync")
auto config = p.get_config();
reserved::unit_test_thread_hierarchy_inner_sync<false, size_t(0), true, size_t(0)><<<config[1], config[2]>>>(h);

cuda_safe_call(cudaDeviceSynchronize());
cuda_try(cudaDeviceSynchronize());
};

# endif // !defined(CUDASTF_DISABLE_CODE_GENERATION) && _CCCL_CUDA_COMPILATION()
Expand Down
Loading