Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 43 additions & 1 deletion cudax/include/cuda/experimental/__coop/reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -196,9 +196,51 @@ __reduce_impl(this_grid<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn _
return ::cuda::std::nullopt;
}

_CCCL_TEMPLATE(class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn)
_CCCL_REQUIRES(::cuda::std::is_same_v<warp_level, typename _Group::unit_type>
_CCCL_AND ::cuda::std::is_same_v<block_level, typename _Group::level_type>)
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
__reduce_impl(_Group __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
{
constexpr auto __nwarps_in_group = warp.static_count(__group);
static_assert(__nwarps_in_group != ::cuda::std::dynamic_extent,
"cuda::coop::reduce requires the group to have statically known size");

using _WarpReduce = ::cub::WarpReduce<_Tp>;
union _Scratch
{
typename _WarpReduce::TempStorage __warp_reduce_[__nwarps_in_group];
_Tp __partials_[__nwarps_in_group];
};
__shared__ _Scratch __scratch;

const auto __partial = _WarpReduce{__scratch.__warp_reduce_[warp.rank(__group)]}.Reduce(__thread_data, __red_fn);
__group.sync_aligned();

this_warp __warp{__group.hierarchy()};
if (gpu_thread.is_root_rank(__warp))
{
__scratch.__partials_[warp.rank(__group)] = __partial;
}
__group.sync_aligned();

if (warp.is_root_rank(__group))
{
const auto __value = (gpu_thread.rank(__warp) < __nwarps_in_group)
? __scratch.__partials_[gpu_thread.rank(__warp)]
: ::cuda::identity_element<_RedFn, _Tp>();
const auto __result = _WarpReduce{__scratch.__warp_reduce_[0]}.Reduce(__value, __red_fn);
if (gpu_thread.is_root_rank(__warp))
{
return ::cuda::std::optional{__result};
}
}
return ::cuda::std::nullopt;
}

template <class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn)
reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
{
static_assert(gpu_thread.static_count(__group) != ::cuda::std::dynamic_extent,
"cuda::coop::reduce requires the group to have statically known size");
Expand Down
6 changes: 3 additions & 3 deletions cudax/include/cuda/experimental/__group/queries.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,9 @@ template <class _Tp, class _Unit, class _Group>
}
else
{
const auto __unit_rank = __rank_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
const auto __group_unit_count = ::cuda::experimental::__count_query_group<_Tp, _Unit>(__group);
return static_cast<_Tp>(__group_unit_rank * __group_unit_count + __unit_rank);
const auto __unit_rank = __rank_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
const auto __unit_count = __count_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
return static_cast<_Tp>(__group_unit_rank * __unit_count + __unit_rank);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,9 +123,18 @@ public:
_CCCL_ASSERT(__mapping_result.group_count() <= __barriers_.size(), "invalid number of barriers passed");
}

if (__mapping_result.is_valid() && __mapping_result.rank() == 0)
::cuda::std::size_t __nthread_in_unit = 1;
::cuda::std::size_t __thread_rank_in_unit = 0;
if constexpr (!::cuda::std::is_same_v<thread_level, _Unit>)
{
init(&__barriers_[__mapping_result.group_rank()], static_cast<::cuda::std::ptrdiff_t>(__mapping_result.count()));
__nthread_in_unit = gpu_thread.count(_Unit{}, __parent.hierarchy());
__thread_rank_in_unit = gpu_thread.rank(_Unit{}, __parent.hierarchy());
}

if (__mapping_result.is_valid() && __mapping_result.rank() == 0 && __thread_rank_in_unit == 0)
{
init(&__barriers_[__mapping_result.group_rank()],
static_cast<::cuda::std::ptrdiff_t>(__mapping_result.count() * __nthread_in_unit));
}

// todo(dabayer): How we can expose making this aligned?
Expand Down
3 changes: 3 additions & 0 deletions cudax/test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,9 @@ cudax_add_catch2_test(test_target coop.reduce.this_cluster
cudax_add_catch2_test(test_target coop.reduce.this_grid
coop/reduce/this_grid.cu
)
cudax_add_catch2_test(test_target coop.reduce.warps_within_block
coop/reduce/warps_within_block.cu
)

if (cudax_ENABLE_CUFILE)
cudax_add_catch2_test(test_target cufile.driver_attributes
Expand Down
192 changes: 192 additions & 0 deletions cudax/test/coop/reduce/warps_within_block.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#include <cuda/atomic>
#include <cuda/devices>
#include <cuda/functional>
#include <cuda/hierarchy>
#include <cuda/launch>
#include <cuda/std/algorithm>
#include <cuda/std/type_traits>
#include <cuda/stream>

#include <cuda/experimental/coop.cuh>
#include <cuda/experimental/group.cuh>

#include <testing.cuh>

#include <c2h/catch2_test_helper.h>
#include <c2h/extended_types.h>
#include <c2h/generators.h>
#include <catch2/matchers/catch_matchers_floating_point.hpp>

constexpr int nwarps_in_group = 3;
constexpr int warp_size = 32;

/***********************************************************************************************************************
* Thread Reduce Wrapper Kernels
**********************************************************************************************************************/

struct ReduceKernel
{
template <class Config, int NumItems, class T, class RedOp>
__device__ void operator()(
Config config,
cuda::std::integral_constant<int, NumItems>,
const T* __restrict__ d_in,
T* __restrict__ d_out,
RedOp red_op)
{
cudax::this_block block{config};

using Barriers = cuda::barrier<cuda::thread_scope_block>[1];
__shared__ cuda::std::aligned_storage_t<sizeof(Barriers), alignof(Barriers)> barriers_storage;
auto& barriers = reinterpret_cast<Barriers&>(barriers_storage);

cudax::group group{
cuda::warp, block, cudax::group_by<nwarps_in_group, false>{}, cudax::barrier_synchronizer{barriers}};

Comment thread
davebayer marked this conversation as resolved.
// All threads that are not part of the groups should exit early.
if (!cuda::gpu_thread.is_part_of(group))
{
return;
}

T thread_data[NumItems];
for (int i = 0; i < NumItems; ++i)
{
thread_data[i] = d_in[cuda::gpu_thread.rank_as<int>(group) + i * cuda::gpu_thread.count_as<int>(group)];
}
const auto result = cudax::coop::reduce(group, thread_data, red_op);

REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(group));
if (cuda::gpu_thread.is_root_rank(group))
{
*d_out = result.value();
}
}
};

/***********************************************************************************************************************
* Type list definition
**********************************************************************************************************************/

using integral_type_list =
c2h::type_list<cuda::std::int8_t, cuda::std::int16_t, cuda::std::uint16_t, cuda::std::int32_t, cuda::std::int64_t>;

using fp_type_list = c2h::type_list<float, double>;

using operator_integral_list =
c2h::type_list<cuda::std::plus<>,
cuda::std::multiplies<>,
cuda::std::bit_and<>,
cuda::std::bit_or<>,
cuda::std::bit_xor<>,
cuda::minimum<>,
cuda::maximum<>>;

using operator_fp_list = c2h::type_list<cuda::std::plus<>, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>;

/***********************************************************************************************************************
* Verify results and kernel launch
**********************************************************************************************************************/

template <class T>
void verify_results(const T& expected_data, const T& test_results)
{
if constexpr (cuda::std::is_floating_point_v<T>)
{
REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05}));
}
else
{
REQUIRE(expected_data == test_results);
}
}

template <class T, class RedOp>
void run_thread_reduce_kernel(
cuda::stream_ref stream, int num_items, const c2h::device_vector<T>& in, c2h::device_vector<T>& out, RedOp red_op)
{
const auto config = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<(nwarps_in_group + 2) * warp_size>());
const auto in_ptr = thrust::raw_pointer_cast(in.data());
const auto out_ptr = thrust::raw_pointer_cast(out.data());
const ReduceKernel kernel{};

switch (num_items)
{
case 1:
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 1>{}, in_ptr, out_ptr, red_op);
break;
case 2:
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 2>{}, in_ptr, out_ptr, red_op);
break;
case 3:
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 3>{}, in_ptr, out_ptr, red_op);
break;
case 4:
cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 4>{}, in_ptr, out_ptr, red_op);
break;
default:
FAIL("Unsupported number of items");
}
stream.sync();
}

constexpr int max_size = 4;
constexpr int num_seeds = 10;

/***********************************************************************************************************************
* Test cases
**********************************************************************************************************************/

_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data

C2H_TEST("reduce/this_warp Integral Type Tests", "[reduce][this_warp]", integral_type_list, operator_integral_list)
{
using value_t = c2h::get<0, TestType>;
using op_t = c2h::get<1, TestType>;
constexpr auto reduce_op = op_t{};
constexpr auto operator_identity = cuda::identity_element<op_t, value_t>();
CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
c2h::device_vector<value_t> d_in(max_size * nwarps_in_group * warp_size);
c2h::device_vector<value_t> d_out(1);
c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
c2h::host_vector<value_t> h_in = d_in;
cuda::stream stream{cuda::devices[0]};
for (int num_items = 1; num_items <= max_size; ++num_items)
{
auto reference_result = cuda::std::accumulate(
h_in.begin(), h_in.begin() + num_items * nwarps_in_group * warp_size, operator_identity, reduce_op);
run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op);
verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
}
}

C2H_TEST("reduce/this_warp Floating-Point Type Tests", "[reduce][this_warp]", fp_type_list, operator_fp_list)
{
using value_t = c2h::get<0, TestType>;
using op_t = c2h::get<1, TestType>;
constexpr auto reduce_op = op_t{};
const auto operator_identity = cuda::identity_element<op_t, value_t>();
CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
c2h::device_vector<value_t> d_in(max_size * nwarps_in_group * warp_size);
c2h::device_vector<value_t> d_out(1);
c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
c2h::host_vector<value_t> h_in = d_in;
cuda::stream stream{cuda::devices[0]};
for (int num_items = 1; num_items <= max_size; ++num_items)
{
auto reference_result = cuda::std::accumulate(
h_in.begin(), h_in.begin() + num_items * nwarps_in_group * warp_size, operator_identity, reduce_op);
run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op);
verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
}
}
Loading