NVIDIA · davebayer · Jun 4, 2026 · Jun 5, 2026
@@ -196,9 +196,51 @@ __reduce_impl(this_grid<_Hierarchy> __group, _Tp (&__thread_data)[_Np], _RedFn _
   return ::cuda::std::nullopt;
 }
 
+_CCCL_TEMPLATE(class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn)
+_CCCL_REQUIRES(::cuda::std::is_same_v<warp_level, typename _Group::unit_type>
+                 _CCCL_AND ::cuda::std::is_same_v<block_level, typename _Group::level_type>)
+[[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
+__reduce_impl(_Group __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
+{
+  constexpr auto __nwarps_in_group = warp.static_count(__group);
+  static_assert(__nwarps_in_group != ::cuda::std::dynamic_extent,
+                "cuda::coop::reduce requires the group to have statically known size");
+
+  using _WarpReduce = ::cub::WarpReduce<_Tp>;
+  union _Scratch
+  {
+    typename _WarpReduce::TempStorage __warp_reduce_[__nwarps_in_group];
+    _Tp __partials_[__nwarps_in_group];
+  };
+  __shared__ _Scratch __scratch;
+
+  const auto __partial = _WarpReduce{__scratch.__warp_reduce_[warp.rank(__group)]}.Reduce(__thread_data, __red_fn);
+  __group.sync_aligned();
+
+  this_warp __warp{__group.hierarchy()};
+  if (gpu_thread.is_root_rank(__warp))
+  {
+    __scratch.__partials_[warp.rank(__group)] = __partial;
+  }
+  __group.sync_aligned();
+
+  if (warp.is_root_rank(__group))
+  {
+    const auto __value  = (gpu_thread.rank(__warp) < __nwarps_in_group)
+                          ? __scratch.__partials_[gpu_thread.rank(__warp)]
+                          : ::cuda::identity_element<_RedFn, _Tp>();
+    const auto __result = _WarpReduce{__scratch.__warp_reduce_[0]}.Reduce(__value, __red_fn);
+    if (gpu_thread.is_root_rank(__warp))
+    {
+      return ::cuda::std::optional{__result};
+    }
+  }
+  return ::cuda::std::nullopt;
+}
+
 template <class _Group, class _Tp, ::cuda::std::size_t _Np, class _RedFn>
 [[nodiscard]] _CCCL_DEVICE_API ::cuda::std::optional<_Tp>
-reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn&& __red_fn)
+reduce(_Group __group, _Tp (&__thread_data)[_Np], _RedFn __red_fn)
 {
   static_assert(gpu_thread.static_count(__group) != ::cuda::std::dynamic_extent,
                 "cuda::coop::reduce requires the group to have statically known size");

@@ -103,9 +103,9 @@ template <class _Tp, class _Unit, class _Group>
   }
   else
   {
-    const auto __unit_rank        = __rank_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
-    const auto __group_unit_count = ::cuda::experimental::__count_query_group<_Tp, _Unit>(__group);
-    return static_cast<_Tp>(__group_unit_rank * __group_unit_count + __unit_rank);
+    const auto __unit_rank  = __rank_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
+    const auto __unit_count = __count_query<_Unit, _GroupUnit>::template __call<_Tp>(__group.hierarchy());
+    return static_cast<_Tp>(__group_unit_rank * __unit_count + __unit_rank);
   }
 }
 

@@ -123,9 +123,18 @@ public:
       _CCCL_ASSERT(__mapping_result.group_count() <= __barriers_.size(), "invalid number of barriers passed");
     }
 
-    if (__mapping_result.is_valid() && __mapping_result.rank() == 0)
+    ::cuda::std::size_t __nthread_in_unit     = 1;
+    ::cuda::std::size_t __thread_rank_in_unit = 0;
+    if constexpr (!::cuda::std::is_same_v<thread_level, _Unit>)
     {
-      init(&__barriers_[__mapping_result.group_rank()], static_cast<::cuda::std::ptrdiff_t>(__mapping_result.count()));
+      __nthread_in_unit     = gpu_thread.count(_Unit{}, __parent.hierarchy());
+      __thread_rank_in_unit = gpu_thread.rank(_Unit{}, __parent.hierarchy());
+    }
+
+    if (__mapping_result.is_valid() && __mapping_result.rank() == 0 && __thread_rank_in_unit == 0)
+    {
+      init(&__barriers_[__mapping_result.group_rank()],
+           static_cast<::cuda::std::ptrdiff_t>(__mapping_result.count() * __nthread_in_unit));
     }
 
     // todo(dabayer): How we can expose making this aligned?

@@ -193,6 +193,9 @@ cudax_add_catch2_test(test_target coop.reduce.this_cluster
 cudax_add_catch2_test(test_target coop.reduce.this_grid
     coop/reduce/this_grid.cu
 )
+cudax_add_catch2_test(test_target coop.reduce.warps_within_block
+    coop/reduce/warps_within_block.cu
+)
 
 if (cudax_ENABLE_CUFILE)
   cudax_add_catch2_test(test_target cufile.driver_attributes

@@ -0,0 +1,192 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDA Experimental in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/atomic>
+#include <cuda/devices>
+#include <cuda/functional>
+#include <cuda/hierarchy>
+#include <cuda/launch>
+#include <cuda/std/algorithm>
+#include <cuda/std/type_traits>
+#include <cuda/stream>
+
+#include <cuda/experimental/coop.cuh>
+#include <cuda/experimental/group.cuh>
+
+#include <testing.cuh>
+
+#include <c2h/catch2_test_helper.h>
+#include <c2h/extended_types.h>
+#include <c2h/generators.h>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+
+constexpr int nwarps_in_group = 3;
+constexpr int warp_size       = 32;
+
+/***********************************************************************************************************************
+ * Thread Reduce Wrapper Kernels
+ **********************************************************************************************************************/
+
+struct ReduceKernel
+{
+  template <class Config, int NumItems, class T, class RedOp>
+  __device__ void operator()(
+    Config config,
+    cuda::std::integral_constant<int, NumItems>,
+    const T* __restrict__ d_in,
+    T* __restrict__ d_out,
+    RedOp red_op)
+  {
+    cudax::this_block block{config};
+
+    using Barriers = cuda::barrier<cuda::thread_scope_block>[1];
+    __shared__ cuda::std::aligned_storage_t<sizeof(Barriers), alignof(Barriers)> barriers_storage;
+    auto& barriers = reinterpret_cast<Barriers&>(barriers_storage);
+
+    cudax::group group{
+      cuda::warp, block, cudax::group_by<nwarps_in_group, false>{}, cudax::barrier_synchronizer{barriers}};
+
+    // All threads that are not part of the groups should exit early.
+    if (!cuda::gpu_thread.is_part_of(group))
+    {
+      return;
+    }
+
+    T thread_data[NumItems];
+    for (int i = 0; i < NumItems; ++i)
+    {
+      thread_data[i] = d_in[cuda::gpu_thread.rank_as<int>(group) + i * cuda::gpu_thread.count_as<int>(group)];
+    }
+    const auto result = cudax::coop::reduce(group, thread_data, red_op);
+
+    REQUIRE(result.has_value() == cuda::gpu_thread.is_root_rank(group));
+    if (cuda::gpu_thread.is_root_rank(group))
+    {
+      *d_out = result.value();
+    }
+  }
+};
+
+/***********************************************************************************************************************
+ * Type list definition
+ **********************************************************************************************************************/
+
+using integral_type_list =
+  c2h::type_list<cuda::std::int8_t, cuda::std::int16_t, cuda::std::uint16_t, cuda::std::int32_t, cuda::std::int64_t>;
+
+using fp_type_list = c2h::type_list<float, double>;
+
+using operator_integral_list =
+  c2h::type_list<cuda::std::plus<>,
+                 cuda::std::multiplies<>,
+                 cuda::std::bit_and<>,
+                 cuda::std::bit_or<>,
+                 cuda::std::bit_xor<>,
+                 cuda::minimum<>,
+                 cuda::maximum<>>;
+
+using operator_fp_list = c2h::type_list<cuda::std::plus<>, cuda::std::multiplies<>, cuda::minimum<>, cuda::maximum<>>;
+
+/***********************************************************************************************************************
+ * Verify results and kernel launch
+ **********************************************************************************************************************/
+
+template <class T>
+void verify_results(const T& expected_data, const T& test_results)
+{
+  if constexpr (cuda::std::is_floating_point_v<T>)
+  {
+    REQUIRE_THAT(expected_data, Catch::Matchers::WithinRel(test_results, T{0.05}));
+  }
+  else
+  {
+    REQUIRE(expected_data == test_results);
+  }
+}
+
+template <class T, class RedOp>
+void run_thread_reduce_kernel(
+  cuda::stream_ref stream, int num_items, const c2h::device_vector<T>& in, c2h::device_vector<T>& out, RedOp red_op)
+{
+  const auto config  = cuda::make_config(cuda::grid_dims<1>(), cuda::block_dims<(nwarps_in_group + 2) * warp_size>());
+  const auto in_ptr  = thrust::raw_pointer_cast(in.data());
+  const auto out_ptr = thrust::raw_pointer_cast(out.data());
+  const ReduceKernel kernel{};
+
+  switch (num_items)
+  {
+    case 1:
+      cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 1>{}, in_ptr, out_ptr, red_op);
+      break;
+    case 2:
+      cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 2>{}, in_ptr, out_ptr, red_op);
+      break;
+    case 3:
+      cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 3>{}, in_ptr, out_ptr, red_op);
+      break;
+    case 4:
+      cuda::launch(stream, config, kernel, cuda::std::integral_constant<int, 4>{}, in_ptr, out_ptr, red_op);
+      break;
+    default:
+      FAIL("Unsupported number of items");
+  }
+  stream.sync();
+}
+
+constexpr int max_size  = 4;
+constexpr int num_seeds = 10;
+
+/***********************************************************************************************************************
+ * Test cases
+ **********************************************************************************************************************/
+
+_CCCL_DIAG_SUPPRESS_MSVC(4244) // warning C4244: '=': conversion from 'int' to '_Tp', possible loss of data
+
+C2H_TEST("reduce/this_warp Integral Type Tests", "[reduce][this_warp]", integral_type_list, operator_integral_list)
+{
+  using value_t                    = c2h::get<0, TestType>;
+  using op_t                       = c2h::get<1, TestType>;
+  constexpr auto reduce_op         = op_t{};
+  constexpr auto operator_identity = cuda::identity_element<op_t, value_t>();
+  CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
+  c2h::device_vector<value_t> d_in(max_size * nwarps_in_group * warp_size);
+  c2h::device_vector<value_t> d_out(1);
+  c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
+  c2h::host_vector<value_t> h_in = d_in;
+  cuda::stream stream{cuda::devices[0]};
+  for (int num_items = 1; num_items <= max_size; ++num_items)
+  {
+    auto reference_result = cuda::std::accumulate(
+      h_in.begin(), h_in.begin() + num_items * nwarps_in_group * warp_size, operator_identity, reduce_op);
+    run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op);
+    verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
+  }
+}
+
+C2H_TEST("reduce/this_warp Floating-Point Type Tests", "[reduce][this_warp]", fp_type_list, operator_fp_list)
+{
+  using value_t                = c2h::get<0, TestType>;
+  using op_t                   = c2h::get<1, TestType>;
+  constexpr auto reduce_op     = op_t{};
+  const auto operator_identity = cuda::identity_element<op_t, value_t>();
+  CAPTURE(c2h::type_name<value_t>(), max_size, c2h::type_name<decltype(reduce_op)>());
+  c2h::device_vector<value_t> d_in(max_size * nwarps_in_group * warp_size);
+  c2h::device_vector<value_t> d_out(1);
+  c2h::gen(C2H_SEED(num_seeds), d_in, cuda::std::numeric_limits<value_t>::min());
+  c2h::host_vector<value_t> h_in = d_in;
+  cuda::stream stream{cuda::devices[0]};
+  for (int num_items = 1; num_items <= max_size; ++num_items)
+  {
+    auto reference_result = cuda::std::accumulate(
+      h_in.begin(), h_in.begin() + num_items * nwarps_in_group * warp_size, operator_identity, reduce_op);
+    run_thread_reduce_kernel(stream, num_items, d_in, d_out, reduce_op);
+    verify_results(reference_result, c2h::host_vector<value_t>(d_out)[0]);
+  }
+}