Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions cub/benchmarks/bench/reduce/base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ struct policy_selector
template <typename T, typename OffsetT>
void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using init_t = T;
using init_value_t = T;

// Retrieve axis parameters
const auto elements = state.get_int64("Elements{io}");
Expand All @@ -49,11 +49,18 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
launch
#if !TUNE_BASE
,
cuda::execution::tune(policy_selector<cuda::std::__accumulator_t<op_t, T, init_t>>{})
cuda::execution::tune(policy_selector<cuda::std::__accumulator_t<op_t, T, init_value_t>>{})
#endif // !TUNE_BASE
);
_CCCL_TRY_CUDA_API(
cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, static_cast<OffsetT>(elements), op_t{}, init_t{}, env);
cub::DeviceReduce::Reduce,
"Reduce failed",
d_in,
d_out,
static_cast<OffsetT>(elements),
op_t{},
init_value_t{},
env);
});
}

Expand Down
4 changes: 2 additions & 2 deletions cub/benchmarks/bench/reduce/deterministic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ struct policy_selector_t
template <class T>
void deterministic_sum(nvbench::state& state, nvbench::type_list<T>)
{
using init_t = T;
using init_value_t = T;

const auto elements = static_cast<int>(state.get_int64("Elements{io}"));

Expand All @@ -54,7 +54,7 @@ void deterministic_sum(nvbench::state& state, nvbench::type_list<T>)
#endif // !TUNE_BASE
);
_CCCL_TRY_CUDA_API(
cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, elements, cuda::std::plus<>{}, init_t{}, env);
cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, elements, cuda::std::plus<>{}, init_value_t{}, env);
});
}

Expand Down
15 changes: 11 additions & 4 deletions cub/benchmarks/bench/reduce/nondeterministic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ struct policy_selector
template <typename T, typename OffsetT>
void nondeterministic_sum(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using op_t = cuda::std::plus<>;
using init_t = T;
using op_t = cuda::std::plus<>;
using init_value_t = T;

// Retrieve axis parameters
const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));
Expand All @@ -63,11 +63,18 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list<T, OffsetT>)
cuda::execution::require(cuda::execution::determinism::not_guaranteed)
#if !TUNE_BASE
,
cuda::execution::tune(policy_selector<cuda::std::__accumulator_t<op_t, T, init_t>>{})
cuda::execution::tune(policy_selector<cuda::std::__accumulator_t<op_t, T, init_value_t>>{})
#endif // !TUNE_BASE
);
_CCCL_TRY_CUDA_API(
cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, static_cast<OffsetT>(elements), op_t{}, init_t{}, env);
cub::DeviceReduce::Reduce,
"Reduce failed",
d_in,
d_out,
static_cast<OffsetT>(elements),
op_t{},
init_value_t{},
env);
});
}

Expand Down
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/scan/exclusive/base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ template <typename T, typename OffsetT>
static void basic(nvbench::state& state, nvbench::type_list<T, OffsetT>)
try
{
using init_t = T;
using accum_t [[maybe_unused]] = ::cuda::std::__accumulator_t<op_t, init_t, T>;
using init_value_t = T;
using accum_t [[maybe_unused]] = ::cuda::std::__accumulator_t<op_t, init_value_t, T>;
using offset_t = cub::detail::choose_offset_t<OffsetT>;
#if USES_WARPSPEED()
static_assert(sizeof(offset_t) == sizeof(size_t)); // warpspeed scan uses size_t internally
Expand Down Expand Up @@ -55,7 +55,7 @@ try
d_input,
d_output,
op_t{},
init_t{},
init_value_t{},
static_cast<offset_t>(input.size()),
env);
});
Expand Down
8 changes: 4 additions & 4 deletions cub/benchmarks/bench/scan/exclusive/deterministic.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ template <typename T, typename OffsetT>
static void exclusive_scan(nvbench::state& state, nvbench::type_list<T, OffsetT>)
try
{
using init_t = T;
using offset_t = OffsetT;
using scan_op_t = ::cuda::std::plus<T>;
using init_value_t = T;
using offset_t = OffsetT;
using scan_op_t = ::cuda::std::plus<T>;

const auto elements = static_cast<std::size_t>(state.get_int64("Elements{io}"));

Expand All @@ -38,7 +38,7 @@ try
d_input,
d_output,
scan_op_t{},
init_t{},
init_value_t{},
static_cast<offset_t>(elements),
env);
});
Expand Down
8 changes: 4 additions & 4 deletions cub/benchmarks/bench/segmented_reduce/base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,9 @@ void fixed_size_segmented_reduce(nvbench::state& state, nvbench::type_list<T>)
{
static constexpr bool is_argmin = std::is_same_v<op_t, cub::detail::arg_min>;

using output_t = cuda::std::conditional_t<is_argmin, cuda::std::pair<int, T>, T>;
using accum_t = output_t;
using init_t = cuda::std::conditional_t<is_argmin, cub::detail::reduce::empty_problem_init_t<accum_t>, T>;
using output_t = cuda::std::conditional_t<is_argmin, cuda::std::pair<int, T>, T>;
using accum_t = output_t;
using init_value_t = cuda::std::conditional_t<is_argmin, cub::detail::reduce::empty_problem_init_t<accum_t>, T>;

// Retrieve axis parameters
const size_t num_elements = static_cast<size_t>(state.get_int64("Elements{io}"));
Expand Down Expand Up @@ -96,7 +96,7 @@ void fixed_size_segmented_reduce(nvbench::state& state, nvbench::type_list<T>)
static_cast<::cuda::std::int64_t>(num_segments),
static_cast<int>(segment_size),
op_t{},
init_t{},
init_value_t{},
env);
}
});
Expand Down
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/segmented_reduce/variable_base.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ void variable_segmented_reduce(nvbench::state& state, nvbench::type_list<T, Offs
using output_t = cuda::std::conditional_t<(is_argmin || is_argmax), cuda::std::pair<int, T>, T>;
using output_it_t = output_t*;
using accum_t = output_t;
using init_t =
using init_value_t =
cuda::std::conditional_t<(is_argmin || is_argmax), cub::detail::reduce::empty_problem_init_t<accum_t>, T>;
using offset_t = OffsetT;
using begin_offset_it_t = const offset_t*;
Expand Down Expand Up @@ -106,7 +106,7 @@ void variable_segmented_reduce(nvbench::state& state, nvbench::type_list<T, Offs
d_begin_offsets,
d_end_offsets,
op_t{},
init_t{},
init_value_t{},
guaranteed_max_seg_size,
nullptr /* stream */);

Expand All @@ -123,7 +123,7 @@ void variable_segmented_reduce(nvbench::state& state, nvbench::type_list<T, Offs
d_begin_offsets,
d_end_offsets,
op_t{},
init_t{},
init_value_t{},
guaranteed_max_seg_size,
launch.get_stream());
});
Expand Down
6 changes: 3 additions & 3 deletions cub/benchmarks/bench/transform_reduce/sum.cu
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ struct square_t
template <typename T, typename OffsetT>
void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
{
using init_t = T;
using init_value_t = T;
using reduction_op_t = ::cuda::std::plus<>;
using transform_op_t = square_t<T>;

Expand All @@ -62,7 +62,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
launch
#if !TUNE_BASE
,
cuda::execution::tune(policy_selector<cuda::std::__accumulator_t<reduction_op_t, T, init_t>>{})
cuda::execution::tune(policy_selector<cuda::std::__accumulator_t<reduction_op_t, T, init_value_t>>{})
#endif // !TUNE_BASE
);
_CCCL_TRY_CUDA_API(
Expand All @@ -73,7 +73,7 @@ void reduce(nvbench::state& state, nvbench::type_list<T, OffsetT>)
static_cast<OffsetT>(elements),
reduction_op_t{},
transform_op_t{},
init_t{},
init_value_t{},
env);
});
}
Expand Down
37 changes: 21 additions & 16 deletions cub/cub/device/device_reduce.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -250,11 +250,16 @@ private:
template <typename InputIteratorT,
typename OutputIteratorT,
typename ReductionOpT,
typename InitT,
typename InitValueT,
typename NumItemsT,
typename EnvT>
[[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t __minmax_reduce(
InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, InitT init, EnvT env)
InputIteratorT d_in,
OutputIteratorT d_out,
NumItemsT num_items,
ReductionOpT reduction_op,
InitValueT init,
EnvT env)
{
static_assert(!::cuda::std::execution::__queryable_with<EnvT, ::cuda::execution::determinism::__get_determinism_t>,
"Determinism should be used inside requires to have an effect.");
Expand Down Expand Up @@ -662,7 +667,7 @@ public:
// The output value type
using OutputT = cub::detail::non_void_value_t<OutputIteratorT, cub::detail::it_value_t<InputIteratorT>>;

using InitT = OutputT;
using init_value_t = OutputT;

return detail::reduce::dispatch(
d_temp_storage,
Expand All @@ -671,7 +676,7 @@ public:
d_out,
static_cast<OffsetT>(num_items),
::cuda::std::plus<>{},
InitT{}, // zero-initialize
init_value_t{}, // zero-initialize
stream);
}

Expand Down Expand Up @@ -765,10 +770,10 @@ public:
{
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Min");

using OffsetT = detail::choose_offset_t<NumItemsT>; // Signed integer type for global offsets
using InputT = detail::it_value_t<InputIteratorT>;
using InitT = InputT;
using limits_t = ::cuda::std::numeric_limits<InitT>;
using OffsetT = detail::choose_offset_t<NumItemsT>; // Signed integer type for global offsets
using InputT = detail::it_value_t<InputIteratorT>;
using init_value_t = InputT;
using limits_t = ::cuda::std::numeric_limits<init_value_t>;
#ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
static_assert(limits_t::is_specialized,
"cub::DeviceReduce::Min uses cuda::std::numeric_limits<InputIteratorT::value_type>::max() as initial "
Expand Down Expand Up @@ -1286,7 +1291,7 @@ public:

using AccumT = OutputTupleT;

using InitT = detail::reduce::empty_problem_init_t<AccumT>;
using init_value_t = detail::reduce::empty_problem_init_t<AccumT>;

// The output value type
using OutputValueT = typename OutputTupleT::Value;
Expand All @@ -1297,7 +1302,7 @@ public:
ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};
init_value_t initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::max())};

return detail::reduce::dispatch<AccumT>(
d_temp_storage,
Expand Down Expand Up @@ -1398,10 +1403,10 @@ public:
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Max");

// Signed integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
using InputT = detail::it_value_t<InputIteratorT>;
using InitT = InputT;
using limits_t = ::cuda::std::numeric_limits<InitT>;
using OffsetT = detail::choose_offset_t<NumItemsT>;
using InputT = detail::it_value_t<InputIteratorT>;
using init_value_t = InputT;
using limits_t = ::cuda::std::numeric_limits<init_value_t>;
#ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX
static_assert(limits_t::is_specialized,
"cub::DeviceReduce::Max uses cuda::std::numeric_limits<InputIteratorT::value_type>::lowest() as "
Expand Down Expand Up @@ -1746,15 +1751,15 @@ public:
// The output value type
using OutputValueT = typename OutputTupleT::Value;

using InitT = detail::reduce::empty_problem_init_t<AccumT>;
using init_value_t = detail::reduce::empty_problem_init_t<AccumT>;

// Wrapped input iterator to produce index-value <OffsetT, InputT> tuples
using ArgIndexInputIteratorT = ArgIndexInputIterator<InputIteratorT, OffsetT, OutputValueT>;

ArgIndexInputIteratorT d_indexed_in(d_in);

// Initial value
InitT initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};
init_value_t initial_value{AccumT(1, ::cuda::std::numeric_limits<InputValueT>::lowest())};

return detail::reduce::dispatch<AccumT>(
d_temp_storage,
Expand Down
23 changes: 12 additions & 11 deletions cub/cub/device/device_scan.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -329,19 +329,19 @@ struct DeviceScan
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum");

// Unsigned integer type for global offsets
using OffsetT = detail::choose_offset_t<NumItemsT>;
using InitT = cub::detail::it_value_t<InputIteratorT>;
using OffsetT = detail::choose_offset_t<NumItemsT>;
using init_value_t = cub::detail::it_value_t<InputIteratorT>;

// Initial value
InitT init_value{};
init_value_t init_value{};

return detail::scan::dispatch(
d_temp_storage,
temp_storage_bytes,
d_in,
d_out,
::cuda::std::plus<>{},
detail::InputValue<InitT>(init_value),
detail::InputValue<init_value_t>(init_value),
static_cast<OffsetT>(num_items),
stream);
}
Expand Down Expand Up @@ -412,10 +412,11 @@ struct DeviceScan
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSum");

using init_t = cub::detail::it_value_t<InputIteratorT>;
init_t init_value{};
using init_value_t = cub::detail::it_value_t<InputIteratorT>;
init_value_t init_value{};

return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue<init_t>(init_value), num_items, env);
return scan_impl_env(
d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue<init_value_t>(init_value), num_items, env);
}

//! @rst
Expand Down Expand Up @@ -2415,8 +2416,8 @@ struct DeviceScan
cudaStream_t stream = nullptr)
{
_CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey");
using init_t = cub::detail::it_value_t<ValuesInputIteratorT>;
init_t init_value{};
using init_value_t = cub::detail::it_value_t<ValuesInputIteratorT>;
init_value_t init_value{};
return scan_by_key_impl<::cuda::std::execution::env<>>(
d_temp_storage,
temp_storage_bytes,
Expand Down Expand Up @@ -2983,7 +2984,7 @@ struct DeviceScan
{
_CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSumByKey");

using init_t = cub::detail::it_value_t<ValuesInputIteratorT>;
using init_value_t = cub::detail::it_value_t<ValuesInputIteratorT>;
return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
using tuning_t = decltype(tuning);
return scan_by_key_impl<tuning_t>(
Expand All @@ -2994,7 +2995,7 @@ struct DeviceScan
d_values_out,
equality_op,
::cuda::std::plus<>{},
init_t{},
init_value_t{},
num_items,
stream);
});
Expand Down
Loading
Loading