Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 45 additions & 0 deletions cub/benchmarks/bench/segmented_reduce/variable_argmax.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

#include <nvbench_helper.cuh>

using value_types = nvbench::type_list<int32_t, int64_t, float, double>;
using op_t = cub::detail::arg_max;
using some_offset_types = nvbench::type_list<int32_t>;

// %RANGE% TUNE_S_ITEMS ipsw 1:16:1
// %RANGE% TUNE_SW_THREADS_POW2 tpsw 1:4:1
// %RANGE% TUNE_S_LOAD sld 0:2:1
// %RANGE% TUNE_THREADS tpb 128:1024:128

#if !TUNE_BASE
# define TUNE_L_ITEMS 16
# define TUNE_M_ITEMS 16

# define TUNE_MW_THREADS_POW2 (TUNE_SW_THREADS_POW2 + 1)

# define TUNE_SW_THREADS (1 << TUNE_SW_THREADS_POW2)
# define TUNE_MW_THREADS (1 << TUNE_MW_THREADS_POW2)

# define SMALL_SEGMENT_SIZE TUNE_S_ITEMS* TUNE_SW_THREADS
# define MEDIUM_SEGMENT_SIZE TUNE_M_ITEMS* TUNE_MW_THREADS
# define LARGE_SEGMENT_SIZE TUNE_L_ITEMS* TUNE_THREADS

# if (LARGE_SEGMENT_SIZE <= SMALL_SEGMENT_SIZE) || (LARGE_SEGMENT_SIZE <= MEDIUM_SEGMENT_SIZE)
# error Large segment size must be larger than small and medium segment sizes
# endif

# if (MEDIUM_SEGMENT_SIZE <= SMALL_SEGMENT_SIZE)
# error Medium segment size must be larger than small one
# endif

# if TUNE_S_LOAD == 0
# define TUNE_S_LOAD_MODIFIER cub::LOAD_DEFAULT
# elif TUNE_S_LOAD == 1
# define TUNE_S_LOAD_MODIFIER cub::LOAD_LDG
# else // TUNE_S_LOAD == 2
# define TUNE_S_LOAD_MODIFIER cub::LOAD_CA
# endif // TUNE_S_LOAD
#endif

#include "variable_base.cuh"
210 changes: 210 additions & 0 deletions cub/benchmarks/bench/segmented_reduce/variable_base.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,210 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

#pragma once

#include <cub/device/dispatch/dispatch_segmented_reduce.cuh>

#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>

#include <cuda/std/type_traits>

#include <nvbench_helper.cuh>

#if !TUNE_BASE
template <typename AccumT>
struct policy_hub_t
{
struct policy_t : cub::ChainedPolicy<300, policy_t, policy_t>
{
static constexpr int items_per_vec_load = 1;

static constexpr int small_threads_per_warp = TUNE_SW_THREADS;
static constexpr int medium_threads_per_warp = TUNE_MW_THREADS;

static constexpr int nominal_4b_large_threads_per_block = TUNE_THREADS;

static constexpr int nominal_4b_small_items_per_thread = TUNE_L_ITEMS;
static constexpr int nominal_4b_medium_items_per_thread = TUNE_M_ITEMS;
static constexpr int nominal_4b_large_items_per_thread = TUNE_S_ITEMS;

using ReducePolicy =
cub::AgentReducePolicy<nominal_4b_large_threads_per_block,
nominal_4b_large_items_per_thread,
AccumT,
items_per_vec_load,
cub::BLOCK_REDUCE_WARP_REDUCTIONS,
cub::LOAD_LDG>;

using SmallReducePolicy =
cub::AgentWarpReducePolicy<ReducePolicy::BLOCK_THREADS,
small_threads_per_warp,
nominal_4b_small_items_per_thread,
AccumT,
items_per_vec_load,
TUNE_S_LOAD_MODIFIER>;

using MediumReducePolicy =
cub::AgentWarpReducePolicy<ReducePolicy::BLOCK_THREADS,
medium_threads_per_warp,
nominal_4b_medium_items_per_thread,
AccumT,
items_per_vec_load,
cub::LOAD_LDG>;
};

using MaxPolicy = policy_t;
};
#endif // !TUNE_BASE

template <typename T, typename OffsetT>
void variable_segmented_reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
static constexpr bool is_argmin = std::is_same_v<op_t, cub::detail::arg_min>;
static constexpr bool is_argmax = std::is_same_v<op_t, cub::detail::arg_max>;

using raw_input_it_t = const T*;
using output_t = cuda::std::conditional_t<(is_argmin || is_argmax), cuda::std::pair<int, T>, T>;
using output_it_t = output_t*;
using accum_t = output_t;
using init_t =
cuda::std::conditional_t<(is_argmin || is_argmax), cub::detail::reduce::empty_problem_init_t<accum_t>, T>;
using offset_t = OffsetT;
using begin_offset_it_t = const offset_t*;
using end_offset_it_t = const offset_t*;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
const auto max_segment_size = static_cast<std::size_t>(state.get_int64("MaxSegmentSize"));
const auto guaranteed_max_seg_size = static_cast<std::size_t>(state.get_int64("GuaranteeMaxSegSize"));

// skip if max_segment_size > guaranteed_max_seg_size
if (guaranteed_max_seg_size != 0 && max_segment_size > guaranteed_max_seg_size)
{
state.skip("max_segment_size > guaranteed_max_seg_size");
return;
}

const auto min_segment_size = 1;
const auto max_segment_size_log = static_cast<offset_t>(std::log2(max_segment_size));

// Generate segment offsets
thrust::device_vector<offset_t> segment_offsets =
generate.uniform.segment_offsets(elements, min_segment_size, max_segment_size);
const auto num_segments = segment_offsets.size() - 1;

// Generate input data
thrust::device_vector<T> in = generate(elements);
thrust::device_vector<output_t> out(num_segments);

raw_input_it_t d_raw_in = thrust::raw_pointer_cast(in.data());
output_it_t d_out = thrust::raw_pointer_cast(out.data());
begin_offset_it_t d_begin_offsets = thrust::raw_pointer_cast(segment_offsets.data());
end_offset_it_t d_end_offsets = d_begin_offsets + 1;

// Create wrapped iterator for argmin/argmax operations
[[maybe_unused]] auto d_indexed_in = thrust::make_transform_iterator(
thrust::counting_iterator<::cuda::std::int64_t>{0},
cub::detail::reduce::generate_idx_value<raw_input_it_t, T>(d_raw_in, 1));
using arg_index_input_iterator_t = decltype(d_indexed_in);

auto get_in = [&] {
if constexpr (is_argmin || is_argmax)
{
return d_indexed_in;
}
else
{
return d_raw_in;
}
};

using input_it_t = decltype(get_in());
input_it_t d_in = get_in();

// Enable throughput calculations
state.add_element_count(elements);
state.add_global_memory_reads<T>(elements, "Size");
state.add_global_memory_writes<output_t>(num_segments);
state.add_global_memory_reads<offset_t>(num_segments + 1);

using dispatch_t = cub::DispatchSegmentedReduce<
input_it_t,
output_it_t,
begin_offset_it_t,
end_offset_it_t,
offset_t,
op_t,
init_t,
accum_t
#if !TUNE_BASE
,
policy_hub_t<accum_t>
#endif // TUNE_BASE
>;

// Allocate temporary storage
std::size_t temp_size{};
dispatch_t::Dispatch(
nullptr,
temp_size,
d_in,
d_out,
static_cast<::cuda::std::int64_t>(num_segments),
d_begin_offsets,
d_end_offsets,
op_t{},
init_t{},
0 /* stream */,
guaranteed_max_seg_size);

thrust::device_vector<nvbench::uint8_t> temp(temp_size);
auto* temp_storage = thrust::raw_pointer_cast(temp.data());

state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch, [&](nvbench::launch& launch) {
dispatch_t::Dispatch(
temp_storage,
temp_size,
d_in,
d_out,
static_cast<::cuda::std::int64_t>(num_segments),
d_begin_offsets,
d_end_offsets,
op_t{},
init_t{},
launch.get_stream(),
guaranteed_max_seg_size);
});
}

NVBENCH_BENCH_TYPES(variable_segmented_reduce, NVBENCH_TYPE_AXES(value_types, some_offset_types))
.set_name("variable_default")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_power_of_two_axis("MaxSegmentSize", nvbench::range(1, 16, 1))
.add_int64_axis("GuaranteeMaxSegSize", {0});

// Small segments: 1-16 items per segment
NVBENCH_BENCH_TYPES(variable_segmented_reduce, NVBENCH_TYPE_AXES(value_types, some_offset_types))
.set_name("variable_small_dynamic")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_power_of_two_axis("MaxSegmentSize", nvbench::range(1, 4, 1))
.add_int64_power_of_two_axis("GuaranteeMaxSegSize", nvbench::range(1, 4, 1));

// Medium segments: 32-256 items per segment
NVBENCH_BENCH_TYPES(variable_segmented_reduce, NVBENCH_TYPE_AXES(value_types, some_offset_types))
.set_name("variable_medium_dynamic")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_power_of_two_axis("MaxSegmentSize", nvbench::range(5, 8, 1))
.add_int64_power_of_two_axis("GuaranteeMaxSegSize", nvbench::range(5, 8, 1));

// Large segments: 512+ items per segment
NVBENCH_BENCH_TYPES(variable_segmented_reduce, NVBENCH_TYPE_AXES(value_types, some_offset_types))
.set_name("variable_large_dynamic")
.set_type_axes_names({"T{ct}", "OffsetT{ct}"})
.add_int64_power_of_two_axis("Elements{io}", nvbench::range(16, 28, 4))
.add_int64_power_of_two_axis("MaxSegmentSize", nvbench::range(9, 16, 1))
.add_int64_power_of_two_axis("GuaranteeMaxSegSize", nvbench::range(9, 16, 1));
10 changes: 10 additions & 0 deletions cub/benchmarks/bench/segmented_reduce/variable_sum.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
// SPDX-License-Identifier: BSD-3

#include <nvbench_helper.cuh>

using value_types = nvbench::type_list<int32_t, int64_t, float, double>;
using op_t = ::cuda::std::plus<>;
using some_offset_types = nvbench::type_list<int32_t>;

#include "variable_base.cuh"
3 changes: 2 additions & 1 deletion cub/cub/device/device_segmented_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cub/detail/temporary_storage.cuh>
#include <cub/device/dispatch/dispatch_fixed_size_segmented_reduce.cuh>
#include <cub/device/dispatch/dispatch_segmented_reduce.cuh>
#include <cub/device/dispatch/tuning/tuning_reduce.cuh>
#include <cub/iterator/arg_index_input_iterator.cuh>
#include <cub/util_type.cuh>

Expand Down Expand Up @@ -69,7 +70,7 @@ struct tuning
struct default_tuning : tuning<default_tuning>
{
template <class AccumT, class Offset, class OpT>
using fn = detail::reduce::policy_hub<AccumT, Offset, OpT>;
using fn = detail::fixed_size_segmented_reduce::policy_hub<AccumT, Offset, OpT>;
};
} // namespace segmented_reduce
} // namespace detail
Expand Down
Loading
Loading