diff --git a/cub/benchmarks/bench/segmented_reduce/variable_argmax.cu b/cub/benchmarks/bench/segmented_reduce/variable_argmax.cu new file mode 100644 index 00000000000..2ea9c0bebce --- /dev/null +++ b/cub/benchmarks/bench/segmented_reduce/variable_argmax.cu @@ -0,0 +1,45 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3 + +#include + +using value_types = nvbench::type_list; +using op_t = cub::detail::arg_max; +using some_offset_types = nvbench::type_list; + +// %RANGE% TUNE_S_ITEMS ipsw 1:16:1 +// %RANGE% TUNE_SW_THREADS_POW2 tpsw 1:4:1 +// %RANGE% TUNE_S_LOAD sld 0:2:1 +// %RANGE% TUNE_THREADS tpb 128:1024:128 + +#if !TUNE_BASE +# define TUNE_L_ITEMS 16 +# define TUNE_M_ITEMS 16 + +# define TUNE_MW_THREADS_POW2 (TUNE_SW_THREADS_POW2 + 1) + +# define TUNE_SW_THREADS (1 << TUNE_SW_THREADS_POW2) +# define TUNE_MW_THREADS (1 << TUNE_MW_THREADS_POW2) + +# define SMALL_SEGMENT_SIZE TUNE_S_ITEMS* TUNE_SW_THREADS +# define MEDIUM_SEGMENT_SIZE TUNE_M_ITEMS* TUNE_MW_THREADS +# define LARGE_SEGMENT_SIZE TUNE_L_ITEMS* TUNE_THREADS + +# if (LARGE_SEGMENT_SIZE <= SMALL_SEGMENT_SIZE) || (LARGE_SEGMENT_SIZE <= MEDIUM_SEGMENT_SIZE) +# error Large segment size must be larger than small and medium segment sizes +# endif + +# if (MEDIUM_SEGMENT_SIZE <= SMALL_SEGMENT_SIZE) +# error Medium segment size must be larger than small one +# endif + +# if TUNE_S_LOAD == 0 +# define TUNE_S_LOAD_MODIFIER cub::LOAD_DEFAULT +# elif TUNE_S_LOAD == 1 +# define TUNE_S_LOAD_MODIFIER cub::LOAD_LDG +# else // TUNE_S_LOAD == 2 +# define TUNE_S_LOAD_MODIFIER cub::LOAD_CA +# endif // TUNE_S_LOAD +#endif + +#include "variable_base.cuh" diff --git a/cub/benchmarks/bench/segmented_reduce/variable_base.cuh b/cub/benchmarks/bench/segmented_reduce/variable_base.cuh new file mode 100644 index 00000000000..ed775a990b4 --- /dev/null +++ b/cub/benchmarks/bench/segmented_reduce/variable_base.cuh @@ -0,0 +1,210 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3 + +#pragma once + +#include + +#include +#include + +#include + +#include + +#if !TUNE_BASE +template +struct policy_hub_t +{ + struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t> + { + static constexpr int items_per_vec_load = 1; + + static constexpr int small_threads_per_warp = TUNE_SW_THREADS; + static constexpr int medium_threads_per_warp = TUNE_MW_THREADS; + + static constexpr int nominal_4b_large_threads_per_block = TUNE_THREADS; + + static constexpr int nominal_4b_small_items_per_thread = TUNE_L_ITEMS; + static constexpr int nominal_4b_medium_items_per_thread = TUNE_M_ITEMS; + static constexpr int nominal_4b_large_items_per_thread = TUNE_S_ITEMS; + + using ReducePolicy = + cub::AgentReducePolicy; + + using SmallReducePolicy = + cub::AgentWarpReducePolicy; + + using MediumReducePolicy = + cub::AgentWarpReducePolicy; + }; + + using MaxPolicy = policy_t; +}; +#endif // !TUNE_BASE + +template +void variable_segmented_reduce(nvbench::state& state, nvbench::type_list) +{ + static constexpr bool is_argmin = std::is_same_v; + static constexpr bool is_argmax = std::is_same_v; + + using raw_input_it_t = const T*; + using output_t = cuda::std::conditional_t<(is_argmin || is_argmax), cuda::std::pair, T>; + using output_it_t = output_t*; + using accum_t = output_t; + using init_t = + cuda::std::conditional_t<(is_argmin || is_argmax), cub::detail::reduce::empty_problem_init_t, T>; + using offset_t = OffsetT; + using begin_offset_it_t = const offset_t*; + using end_offset_it_t = const offset_t*; + + // Retrieve axis parameters + const auto elements = static_cast(state.get_int64("Elements{io}")); + const auto max_segment_size = static_cast(state.get_int64("MaxSegmentSize")); + const auto guaranteed_max_seg_size = static_cast(state.get_int64("GuaranteeMaxSegSize")); + + // skip if max_segment_size > guaranteed_max_seg_size + if (guaranteed_max_seg_size != 0 && max_segment_size > guaranteed_max_seg_size) + { + state.skip("max_segment_size > guaranteed_max_seg_size"); + return; + } + + const auto min_segment_size = 1; + const auto max_segment_size_log = static_cast(std::log2(max_segment_size)); + + // Generate segment offsets + thrust::device_vector segment_offsets = + generate.uniform.segment_offsets(elements, min_segment_size, max_segment_size); + const auto num_segments = segment_offsets.size() - 1; + + // Generate input data + thrust::device_vector in = generate(elements); + thrust::device_vector out(num_segments); + + raw_input_it_t d_raw_in = thrust::raw_pointer_cast(in.data()); + output_it_t d_out = thrust::raw_pointer_cast(out.data()); + begin_offset_it_t d_begin_offsets = thrust::raw_pointer_cast(segment_offsets.data()); + end_offset_it_t d_end_offsets = d_begin_offsets + 1; + + // Create wrapped iterator for argmin/argmax operations + [[maybe_unused]] auto d_indexed_in = thrust::make_transform_iterator( + thrust::counting_iterator<::cuda::std::int64_t>{0}, + cub::detail::reduce::generate_idx_value(d_raw_in, 1)); + using arg_index_input_iterator_t = decltype(d_indexed_in); + + auto get_in = [&] { + if constexpr (is_argmin || is_argmax) + { + return d_indexed_in; + } + else + { + return d_raw_in; + } + }; + + using input_it_t = decltype(get_in()); + input_it_t d_in = get_in(); + + // Enable throughput calculations + state.add_element_count(elements); + state.add_global_memory_reads(elements, "Size"); + state.add_global_memory_writes(num_segments); + state.add_global_memory_reads(num_segments + 1); + + using dispatch_t = cub::DispatchSegmentedReduce< + input_it_t, + output_it_t, + begin_offset_it_t, + end_offset_it_t, + offset_t, + op_t, + init_t, + accum_t +#if !TUNE_BASE + , + policy_hub_t +#endif // TUNE_BASE + >; + + // Allocate temporary storage + std::size_t temp_size{}; + dispatch_t::Dispatch( + nullptr, + temp_size, + d_in, + d_out, + static_cast<::cuda::std::int64_t>(num_segments), + d_begin_offsets, + d_end_offsets, + op_t{}, + init_t{}, + 0 /* stream */, + guaranteed_max_seg_size); + + thrust::device_vector temp(temp_size); + auto* temp_storage = thrust::raw_pointer_cast(temp.data()); + + state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) { + dispatch_t::Dispatch( + temp_storage, + temp_size, + d_in, + d_out, + static_cast<::cuda::std::int64_t>(num_segments), + d_begin_offsets, + d_end_offsets, + op_t{}, + init_t{}, + launch.get_stream(), + guaranteed_max_seg_size); + }); +} + +NVBENCH_BENCH_TYPES(variable_segmented_reduce, NVBENCH_TYPE_AXES(value_types, some_offset_types)) + .set_name("variable_default") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) + .add_int64_power_of_two_axis("MaxSegmentSize", nvbench::range(1, 16, 1)) + .add_int64_axis("GuaranteeMaxSegSize", {0}); + +// Small segments: 1-16 items per segment +NVBENCH_BENCH_TYPES(variable_segmented_reduce, NVBENCH_TYPE_AXES(value_types, some_offset_types)) + .set_name("variable_small_dynamic") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) + .add_int64_power_of_two_axis("MaxSegmentSize", nvbench::range(1, 4, 1)) + .add_int64_power_of_two_axis("GuaranteeMaxSegSize", nvbench::range(1, 4, 1)); + +// Medium segments: 32-256 items per segment +NVBENCH_BENCH_TYPES(variable_segmented_reduce, NVBENCH_TYPE_AXES(value_types, some_offset_types)) + .set_name("variable_medium_dynamic") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) + .add_int64_power_of_two_axis("MaxSegmentSize", nvbench::range(5, 8, 1)) + .add_int64_power_of_two_axis("GuaranteeMaxSegSize", nvbench::range(5, 8, 1)); + +// Large segments: 512+ items per segment +NVBENCH_BENCH_TYPES(variable_segmented_reduce, NVBENCH_TYPE_AXES(value_types, some_offset_types)) + .set_name("variable_large_dynamic") + .set_type_axes_names({"T{ct}", "OffsetT{ct}"}) + .add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4)) + .add_int64_power_of_two_axis("MaxSegmentSize", nvbench::range(9, 16, 1)) + .add_int64_power_of_two_axis("GuaranteeMaxSegSize", nvbench::range(9, 16, 1)); diff --git a/cub/benchmarks/bench/segmented_reduce/variable_sum.cu b/cub/benchmarks/bench/segmented_reduce/variable_sum.cu new file mode 100644 index 00000000000..83d2252f1ea --- /dev/null +++ b/cub/benchmarks/bench/segmented_reduce/variable_sum.cu @@ -0,0 +1,10 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3 + +#include + +using value_types = nvbench::type_list; +using op_t = ::cuda::std::plus<>; +using some_offset_types = nvbench::type_list; + +#include "variable_base.cuh" diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index 72f34e45ab0..ebaa08462fa 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -69,7 +70,7 @@ struct tuning struct default_tuning : tuning { template - using fn = detail::reduce::policy_hub; + using fn = detail::fixed_size_segmented_reduce::policy_hub; }; } // namespace segmented_reduce } // namespace detail diff --git a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh index 8c19aa94e3d..0fe4b4c5eb4 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh @@ -97,7 +97,7 @@ template >, typename AccumT = ::cuda::std::__accumulator_t, InitT>, - typename PolicyHub = detail::reduce::policy_hub, + typename PolicyHub = detail::fixed_size_segmented_reduce::policy_hub, typename KernelSource = detail::reduce::DeviceSegmentedReduceKernelSource< typename PolicyHub::MaxPolicy, InputIteratorT, @@ -154,6 +154,9 @@ struct DispatchSegmentedReduce /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; + /// The maximum segment size guarantee in the input segments + size_t max_segment_size; + int ptx_version; // Source getter @@ -178,6 +181,7 @@ struct DispatchSegmentedReduce InitT init, cudaStream_t stream, int ptx_version, + size_t max_segment_size = 0, KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}) : d_temp_storage(d_temp_storage) @@ -190,6 +194,7 @@ struct DispatchSegmentedReduce , reduction_op(reduction_op) , init(init) , stream(stream) + , max_segment_size(max_segment_size) , ptx_version(ptx_version) , kernel_source(kernel_source) , launcher_factory(launcher_factory) @@ -218,6 +223,12 @@ struct DispatchSegmentedReduce { cudaError error = cudaSuccess; + constexpr auto small_items_per_tile = ActivePolicyT::SmallReducePolicy::ITEMS_PER_TILE; + constexpr auto medium_items_per_tile = ActivePolicyT::MediumReducePolicy::ITEMS_PER_TILE; + + static_assert((small_items_per_tile < medium_items_per_tile), + "small items per tile must be less than medium items per tile"); + do { // Return if the caller is simply requesting the size of the storage @@ -228,11 +239,22 @@ struct DispatchSegmentedReduce return cudaSuccess; } + // assume large segment size problem + int segments_per_block = 1; + + if (max_segment_size != 0 && max_segment_size <= small_items_per_tile) // small segment size problem + { + segments_per_block = ActivePolicyT::SmallReducePolicy::SEGMENTS_PER_BLOCK; + } + else if (max_segment_size != 0 && max_segment_size <= medium_items_per_tile) // medium segment size problem + { + segments_per_block = ActivePolicyT::MediumReducePolicy::SEGMENTS_PER_BLOCK; + } + // Init kernel configuration (computes kernel occupancy) // maybe only used inside CUB_DEBUG_LOG code sections [[maybe_unused]] detail::KernelConfig segmented_reduce_config; - error = - CubDebug(segmented_reduce_config.Init(segmented_reduce_kernel, policy.SegmentedReduce(), launcher_factory)); + error = CubDebug(segmented_reduce_config.Init(segmented_reduce_kernel, policy.Reduce(), launcher_factory)); if (cudaSuccess != error) { break; @@ -244,9 +266,11 @@ struct DispatchSegmentedReduce for (::cuda::std::int64_t invocation_index = 0; invocation_index < num_invocations; invocation_index++) { - const auto current_seg_offset = invocation_index * num_segments_per_invocation; - const auto num_current_segments = - ::cuda::std::min(num_segments_per_invocation, num_segments - current_seg_offset); + const auto current_seg_offset = invocation_index * num_segments_per_invocation; + const auto num_current_segments = static_cast<::cuda::std::int32_t>( + ::cuda::std::min(num_segments_per_invocation, num_segments - current_seg_offset)); + + const auto num_current_blocks = ::cuda::ceil_div(num_current_segments, segments_per_block); // Log device_reduce_sweep_kernel configuration #ifdef CUB_DEBUG_LOG @@ -261,8 +285,16 @@ struct DispatchSegmentedReduce // Invoke DeviceSegmentedReduceKernel launcher_factory( - static_cast<::cuda::std::uint32_t>(num_current_segments), policy.SegmentedReduce().BlockThreads(), 0, stream) - .doit(segmented_reduce_kernel, d_in, d_out, d_begin_offsets, d_end_offsets, reduction_op, init); + static_cast<::cuda::std::uint32_t>(num_current_blocks), policy.Reduce().BlockThreads(), 0, stream) + .doit(segmented_reduce_kernel, + d_in, + d_out, + d_begin_offsets, + d_end_offsets, + num_current_segments, + reduction_op, + init, + max_segment_size); // Check for failure to launch error = CubDebug(cudaPeekAtLastError()); @@ -294,7 +326,7 @@ struct DispatchSegmentedReduce template CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t Invoke(ActivePolicyT policy = {}) { - auto wrapped_policy = detail::reduce::MakeReducePolicyWrapper(policy); + auto wrapped_policy = detail::reduce::MakeFixedSizeSegmentedReducePolicyWrapper(policy); // Force kernel code-generation in all compiler passes return InvokePasses(kernel_source.SegmentedReduceKernel(), wrapped_policy); } @@ -358,6 +390,7 @@ struct DispatchSegmentedReduce ReductionOpT reduction_op, InitT init, cudaStream_t stream, + size_t max_segment_size = 0, KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}, MaxPolicyT max_policy = {}) @@ -392,6 +425,7 @@ struct DispatchSegmentedReduce init, stream, ptx_version, + max_segment_size, kernel_source, launcher_factory); diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh index cc8182707c0..2efe673f851 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh @@ -105,38 +105,133 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS) OutputIteratorT d_out, BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, + int num_segments, ReductionOpT reduction_op, - InitT init) + InitT init, + size_t max_segment_size) { + using ActivePolicyT = typename ChainedPolicyT::ActivePolicy; + // Thread block type for reducing input tiles - using AgentReduceT = - AgentReduce; + // Use OffsetT for offset calculations to support 64-bit offsets + using AgentReduceT = AgentReduce; - // Shared memory storage - __shared__ typename AgentReduceT::TempStorage temp_storage; + using AgentMediumReduceT = + AgentWarpReduce; - OffsetT segment_begin = d_begin_offsets[blockIdx.x]; - OffsetT segment_end = d_end_offsets[blockIdx.x]; + using AgentSmallReduceT = + AgentWarpReduce; - // Check if empty problem - if (segment_begin == segment_end) + constexpr auto segments_per_medium_block = ActivePolicyT::MediumReducePolicy::SEGMENTS_PER_BLOCK; + constexpr auto medium_threads_per_warp = ActivePolicyT::MediumReducePolicy::WARP_THREADS; + constexpr auto medium_items_per_tile = ActivePolicyT::MediumReducePolicy::ITEMS_PER_TILE; + + constexpr auto segments_per_small_block = ActivePolicyT::SmallReducePolicy::SEGMENTS_PER_BLOCK; + constexpr auto small_threads_per_warp = ActivePolicyT::SmallReducePolicy::WARP_THREADS; + constexpr auto small_items_per_tile = ActivePolicyT::SmallReducePolicy::ITEMS_PER_TILE; + + // Shared memory storage + __shared__ union { - if (threadIdx.x == 0) + typename AgentReduceT::TempStorage large_storage; + typename AgentMediumReduceT::TempStorage medium_storage[segments_per_medium_block]; + typename AgentSmallReduceT::TempStorage small_storage[segments_per_small_block]; + } temp_storage; + + const int bid = blockIdx.x; + const int tid = threadIdx.x; + + auto small_medium_reduction = + [&](auto agent_tag, auto& storage, auto threads_per_warp_tag, auto segments_per_block_tag) { + using AgentWarpReduceT = typename decltype(agent_tag)::type; + constexpr int threads_per_warp = decltype(threads_per_warp_tag)::value; + constexpr int segments_per_block = decltype(segments_per_block_tag)::value; + const int sid_within_block = tid / threads_per_warp; + const int lane_id = tid % threads_per_warp; + const int global_segment_id = bid * segments_per_block + sid_within_block; + + if (global_segment_id < num_segments) + { + const auto segment_begin = static_cast(d_begin_offsets[global_segment_id]); + const auto segment_end = static_cast(d_end_offsets[global_segment_id]); + + // If empty segment, write out the initial value + if (segment_begin == segment_end) + { + if (lane_id == 0) + { + *(d_out + global_segment_id) = detail::reduce::unwrap_empty_problem_init(init); + } + return; + } + // Consume input tiles + AccumT warp_aggregate = + AgentWarpReduceT(storage[sid_within_block], d_in, reduction_op).ConsumeRange(segment_begin, segment_end); + + // Normalize as needed + NormalizeReductionOutput(warp_aggregate, segment_begin, d_in); + + if (lane_id == 0) + { + finalize_and_store_aggregate(d_out + global_segment_id, reduction_op, init, warp_aggregate); + } + } + }; + + auto small_segment_reduction = [&]() { + small_medium_reduction( + ::cuda::std::type_identity{}, + temp_storage.small_storage, + ::cuda::std::integral_constant{}, + ::cuda::std::integral_constant{}); + }; + + auto medium_segment_reduction = [&]() { + small_medium_reduction( + ::cuda::std::type_identity{}, + temp_storage.medium_storage, + ::cuda::std::integral_constant{}, + ::cuda::std::integral_constant{}); + }; + + auto large_segment_reduction = [&]() { + const auto segment_begin = static_cast(d_begin_offsets[bid]); + const auto segment_end = static_cast(d_end_offsets[bid]); + + // If empty segment, write out the initial value + if (segment_begin == segment_end) { - *(d_out + blockIdx.x) = init; + if (tid == 0) + { + *(d_out + bid) = detail::reduce::unwrap_empty_problem_init(init); + } + return; } - return; - } - // Consume input tiles - AccumT block_aggregate = AgentReduceT(temp_storage, d_in, reduction_op).ConsumeRange(segment_begin, segment_end); + // Consume input tiles + AccumT block_aggregate = + AgentReduceT(temp_storage.large_storage, d_in, reduction_op).ConsumeRange(segment_begin, segment_end); + + // Normalize as needed + NormalizeReductionOutput(block_aggregate, segment_begin, d_in); - // Normalize as needed - NormalizeReductionOutput(block_aggregate, segment_begin, d_in); + if (tid == 0) + { + finalize_and_store_aggregate(d_out + bid, reduction_op, init, block_aggregate); + } + }; - if (threadIdx.x == 0) + if (max_segment_size != 0 && max_segment_size <= small_items_per_tile) + { + small_segment_reduction(); + } + else if (max_segment_size != 0 && max_segment_size <= medium_items_per_tile) + { + medium_segment_reduction(); + } + else { - finalize_and_store_aggregate(d_out + blockIdx.x, reduction_op, init, block_aggregate); + large_segment_reduction(); } } diff --git a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh index 5cf4f6ed375..13d796c0b5e 100644 --- a/cub/cub/device/dispatch/tuning/tuning_reduce.cuh +++ b/cub/cub/device/dispatch/tuning/tuning_reduce.cuh @@ -147,6 +147,37 @@ _CCCL_HOST_DEVICE ReducePolicyWrapper MakeReducePolicyWrapper(PolicyT p return ReducePolicyWrapper{policy}; } +template +struct FixedSizeSegmentedReducePolicyWrapper : PolicyT +{ + _CCCL_HOST_DEVICE FixedSizeSegmentedReducePolicyWrapper(PolicyT base) + : PolicyT(base) + {} +}; + +template +struct FixedSizeSegmentedReducePolicyWrapper> + : StaticPolicyT +{ + _CCCL_HOST_DEVICE FixedSizeSegmentedReducePolicyWrapper(StaticPolicyT base) + : StaticPolicyT(base) + {} + + CUB_DEFINE_SUB_POLICY_GETTER(Reduce) + CUB_DEFINE_SUB_POLICY_GETTER(SmallReduce) + CUB_DEFINE_SUB_POLICY_GETTER(MediumReduce) +}; + +template +_CCCL_HOST_DEVICE FixedSizeSegmentedReducePolicyWrapper +MakeFixedSizeSegmentedReducePolicyWrapper(PolicyT policy) +{ + return FixedSizeSegmentedReducePolicyWrapper{policy}; +} + enum class offset_size { _4, diff --git a/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu b/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu new file mode 100644 index 00000000000..2148966e3f4 --- /dev/null +++ b/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu @@ -0,0 +1,80 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved. +// SPDX-License-Identifier: BSD-3-Clause + +#include "insert_nested_NVTX_range_guard.h" + +#include + +#include + +#include "catch2_test_device_reduce.cuh" +#include "catch2_test_launch_helper.h" +#include + +DECLARE_LAUNCH_WRAPPER(cub::DeviceSegmentedReduce::Sum, device_segmented_sum); + +// %PARAM% TEST_LAUNCH lid 0 + +using full_type_list = c2h::type_list; +using offsets = c2h::type_list; + +C2H_TEST("Device segmented reduce works with dynamic max segment sizes", + "[segmented][reduce][device]", + full_type_list, + offsets) +{ + using input_t = typename c2h::get<0, TestType>; + using output_t = input_t; + using offset_t = typename c2h::get<1, TestType>; + + constexpr int min_items = 1; + constexpr int max_items = 10000; + + // Number of items + // Use c2h::adjust_seed_count to reduce runtime on sanitizers. + const int num_items = GENERATE_COPY( + take(c2h::adjust_seed_count(2), random(min_items, max_items)), + values({ + min_items, + max_items, + })); + INFO("Test num_items: " << num_items); + + const size_t max_seg_size = GENERATE(5, 10, 100, 1000, 10000); + + // Range of segment sizes to generate + // Note that the segment range [0, 1] may also include one last segment with more than 1 items + const std::tuple seg_size_range = + GENERATE_COPY(table({{0, 1}, {1, max_seg_size}, {max_seg_size, max_seg_size}})); + INFO("Test seg_size_range: [" << std::get<0>(seg_size_range) << ", " << std::get<1>(seg_size_range) << "]"); + + // Generate input segments + c2h::device_vector segment_offsets = c2h::gen_uniform_offsets( + C2H_SEED(1), num_items, std::get<0>(seg_size_range), std::get<1>(seg_size_range)); + const offset_t num_segments = static_cast(segment_offsets.size() - 1); + auto d_offsets_it = thrust::raw_pointer_cast(segment_offsets.data()); + + // Generate input data + c2h::device_vector in_items(num_items); + c2h::gen(C2H_SEED(2), in_items); + auto d_in_it = thrust::raw_pointer_cast(in_items.data()); + + SECTION("sum") + { + using op_t = cuda::std::plus<>; + using accum_t = cuda::std::__accumulator_t; + + // Prepare verification data + c2h::host_vector expected_result(num_segments); + compute_segmented_problem_reference(in_items, segment_offsets, op_t{}, accum_t{}, expected_result.begin()); + + // Run test + c2h::device_vector out_result(num_segments); + auto d_out_it = unwrap_it(thrust::raw_pointer_cast(out_result.data())); + + device_segmented_sum(d_in_it, d_out_it, num_segments, d_offsets_it, d_offsets_it + 1); + + // Verify result + REQUIRE(expected_result == out_result); + } +}