From c06e2235791f10bc9de65ef95e49a9d3c6115975 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Thu, 4 Jun 2026 18:06:51 -0700 Subject: [PATCH 1/3] use uniform names for init_values throughout CUB --- cub/benchmarks/bench/reduce/base.cuh | 6 +- cub/benchmarks/bench/reduce/deterministic.cu | 4 +- .../bench/reduce/nondeterministic.cu | 6 +- cub/benchmarks/bench/scan/exclusive/base.cuh | 6 +- .../bench/scan/exclusive/deterministic.cu | 4 +- .../bench/segmented_reduce/base.cuh | 4 +- .../bench/segmented_reduce/variable_base.cuh | 6 +- cub/benchmarks/bench/transform_reduce/sum.cu | 6 +- cub/cub/device/device_reduce.cuh | 24 +++---- cub/cub/device/device_scan.cuh | 20 +++--- cub/cub/device/device_segmented_reduce.cuh | 70 +++++++++---------- cub/cub/device/dispatch/dispatch_reduce.cuh | 48 ++++++------- .../dispatch_reduce_deterministic.cuh | 26 +++---- .../dispatch_reduce_nondeterministic.cuh | 22 +++--- .../dispatch/dispatch_segmented_reduce.cuh | 46 ++++++------ .../dispatch/dispatch_streaming_reduce.cuh | 2 +- .../device/dispatch/kernels/kernel_reduce.cuh | 28 ++++---- .../kernels/kernel_reduce_deterministic.cuh | 6 +- .../kernels/kernel_segmented_reduce.cuh | 12 ++-- cub/test/catch2_test_device_reduce.cu | 4 +- cub/test/catch2_test_device_reduce.cuh | 4 +- ...catch2_test_device_reduce_deterministic.cu | 18 ++--- .../catch2_test_device_reduce_dispatcher.cu | 8 +-- cub/test/catch2_test_device_reduce_env.cu | 46 ++++++------ .../catch2_test_device_reduce_iterators.cu | 10 +-- ...ch2_test_device_reduce_nondeterministic.cu | 6 +- cub/test/catch2_test_device_scan.cu | 14 ++-- cub/test/catch2_test_device_scan.cuh | 22 +++--- cub/test/catch2_test_device_scan_by_key.cu | 10 +-- ...atch2_test_device_scan_by_key_iterators.cu | 4 +- cub/test/catch2_test_device_scan_env.cu | 4 +- cub/test/catch2_test_device_scan_invalid.cu | 6 +- cub/test/catch2_test_device_scan_iterators.cu | 8 +-- .../catch2_test_device_segmented_reduce.cu | 8 +-- ..._test_device_segmented_reduce_iterators.cu | 6 +- ...st_device_segmented_reduce_max_seg_size.cu | 6 +- .../catch2_test_device_transform_reduce.cu | 14 ++-- 37 files changed, 272 insertions(+), 272 deletions(-) diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh index 322674e69f7..f6289215a53 100644 --- a/cub/benchmarks/bench/reduce/base.cuh +++ b/cub/benchmarks/bench/reduce/base.cuh @@ -26,7 +26,7 @@ struct policy_selector template void reduce(nvbench::state& state, nvbench::type_list) { - using init_t = T; + using init_value_t = T; // Retrieve axis parameters const auto elements = state.get_int64("Elements{io}"); @@ -49,11 +49,11 @@ void reduce(nvbench::state& state, nvbench::type_list) launch #if !TUNE_BASE , - cuda::execution::tune(policy_selector>{}) + cuda::execution::tune(policy_selector>{}) #endif // !TUNE_BASE ); _CCCL_TRY_CUDA_API( - cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, static_cast(elements), op_t{}, init_t{}, env); + cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, static_cast(elements), op_t{}, init_value_t{}, env); }); } diff --git a/cub/benchmarks/bench/reduce/deterministic.cu b/cub/benchmarks/bench/reduce/deterministic.cu index ea3f79eb8c5..8739a7016ee 100644 --- a/cub/benchmarks/bench/reduce/deterministic.cu +++ b/cub/benchmarks/bench/reduce/deterministic.cu @@ -29,7 +29,7 @@ struct policy_selector_t template void deterministic_sum(nvbench::state& state, nvbench::type_list) { - using init_t = T; + using init_value_t = T; const auto elements = static_cast(state.get_int64("Elements{io}")); @@ -54,7 +54,7 @@ void deterministic_sum(nvbench::state& state, nvbench::type_list) #endif // !TUNE_BASE ); _CCCL_TRY_CUDA_API( - cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, elements, cuda::std::plus<>{}, init_t{}, env); + cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, elements, cuda::std::plus<>{}, init_value_t{}, env); }); } diff --git a/cub/benchmarks/bench/reduce/nondeterministic.cu b/cub/benchmarks/bench/reduce/nondeterministic.cu index b4bd551c79c..dd2e044e458 100644 --- a/cub/benchmarks/bench/reduce/nondeterministic.cu +++ b/cub/benchmarks/bench/reduce/nondeterministic.cu @@ -39,7 +39,7 @@ template void nondeterministic_sum(nvbench::state& state, nvbench::type_list) { using op_t = cuda::std::plus<>; - using init_t = T; + using init_value_t = T; // Retrieve axis parameters const auto elements = static_cast(state.get_int64("Elements{io}")); @@ -63,11 +63,11 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list) cuda::execution::require(cuda::execution::determinism::not_guaranteed) #if !TUNE_BASE , - cuda::execution::tune(policy_selector>{}) + cuda::execution::tune(policy_selector>{}) #endif // !TUNE_BASE ); _CCCL_TRY_CUDA_API( - cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, static_cast(elements), op_t{}, init_t{}, env); + cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, static_cast(elements), op_t{}, init_value_t{}, env); }); } diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 74ef8fa9fb5..5c6931485e6 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -15,8 +15,8 @@ template static void basic(nvbench::state& state, nvbench::type_list) try { - using init_t = T; - using accum_t [[maybe_unused]] = ::cuda::std::__accumulator_t; + using init_value_t = T; + using accum_t [[maybe_unused]] = ::cuda::std::__accumulator_t; using offset_t = cub::detail::choose_offset_t; #if USES_WARPSPEED() static_assert(sizeof(offset_t) == sizeof(size_t)); // warpspeed scan uses size_t internally @@ -55,7 +55,7 @@ try d_input, d_output, op_t{}, - init_t{}, + init_value_t{}, static_cast(input.size()), env); }); diff --git a/cub/benchmarks/bench/scan/exclusive/deterministic.cu b/cub/benchmarks/bench/scan/exclusive/deterministic.cu index 2b71545bc38..95980eb9af0 100644 --- a/cub/benchmarks/bench/scan/exclusive/deterministic.cu +++ b/cub/benchmarks/bench/scan/exclusive/deterministic.cu @@ -13,7 +13,7 @@ template static void exclusive_scan(nvbench::state& state, nvbench::type_list) try { - using init_t = T; + using init_value_t = T; using offset_t = OffsetT; using scan_op_t = ::cuda::std::plus; @@ -38,7 +38,7 @@ try d_input, d_output, scan_op_t{}, - init_t{}, + init_value_t{}, static_cast(elements), env); }); diff --git a/cub/benchmarks/bench/segmented_reduce/base.cuh b/cub/benchmarks/bench/segmented_reduce/base.cuh index bbac000f6b5..a7550fb3063 100644 --- a/cub/benchmarks/bench/segmented_reduce/base.cuh +++ b/cub/benchmarks/bench/segmented_reduce/base.cuh @@ -46,7 +46,7 @@ void fixed_size_segmented_reduce(nvbench::state& state, nvbench::type_list) using output_t = cuda::std::conditional_t, T>; using accum_t = output_t; - using init_t = cuda::std::conditional_t, T>; + using init_value_t = cuda::std::conditional_t, T>; // Retrieve axis parameters const size_t num_elements = static_cast(state.get_int64("Elements{io}")); @@ -96,7 +96,7 @@ void fixed_size_segmented_reduce(nvbench::state& state, nvbench::type_list) static_cast<::cuda::std::int64_t>(num_segments), static_cast(segment_size), op_t{}, - init_t{}, + init_value_t{}, env); } }); diff --git a/cub/benchmarks/bench/segmented_reduce/variable_base.cuh b/cub/benchmarks/bench/segmented_reduce/variable_base.cuh index f3c2fea0ac1..cda9a599393 100644 --- a/cub/benchmarks/bench/segmented_reduce/variable_base.cuh +++ b/cub/benchmarks/bench/segmented_reduce/variable_base.cuh @@ -32,7 +32,7 @@ void variable_segmented_reduce(nvbench::state& state, nvbench::type_list, T>; using output_it_t = output_t*; using accum_t = output_t; - using init_t = + using init_value_t = cuda::std::conditional_t<(is_argmin || is_argmax), cub::detail::reduce::empty_problem_init_t, T>; using offset_t = OffsetT; using begin_offset_it_t = const offset_t*; @@ -106,7 +106,7 @@ void variable_segmented_reduce(nvbench::state& state, nvbench::type_list void reduce(nvbench::state& state, nvbench::type_list) { - using init_t = T; + using init_value_t = T; using reduction_op_t = ::cuda::std::plus<>; using transform_op_t = square_t; @@ -62,7 +62,7 @@ void reduce(nvbench::state& state, nvbench::type_list) launch #if !TUNE_BASE , - cuda::execution::tune(policy_selector>{}) + cuda::execution::tune(policy_selector>{}) #endif // !TUNE_BASE ); _CCCL_TRY_CUDA_API( @@ -73,7 +73,7 @@ void reduce(nvbench::state& state, nvbench::type_list) static_cast(elements), reduction_op_t{}, transform_op_t{}, - init_t{}, + init_value_t{}, env); }); } diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 0c0f828eab1..99a0a22a229 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -250,11 +250,11 @@ private: template [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t __minmax_reduce( - InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, InitT init, EnvT env) + InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, InitValueT init, EnvT env) { static_assert(!::cuda::std::execution::__queryable_with, "Determinism should be used inside requires to have an effect."); @@ -662,7 +662,7 @@ public: // The output value type using OutputT = cub::detail::non_void_value_t>; - using InitT = OutputT; + using init_value_t = OutputT; return detail::reduce::dispatch( d_temp_storage, @@ -671,7 +671,7 @@ public: d_out, static_cast(num_items), ::cuda::std::plus<>{}, - InitT{}, // zero-initialize + init_value_t{}, // zero-initialize stream); } @@ -767,8 +767,8 @@ public: using OffsetT = detail::choose_offset_t; // Signed integer type for global offsets using InputT = detail::it_value_t; - using InitT = InputT; - using limits_t = ::cuda::std::numeric_limits; + using init_value_t = InputT; + using limits_t = ::cuda::std::numeric_limits; #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX static_assert(limits_t::is_specialized, "cub::DeviceReduce::Min uses cuda::std::numeric_limits::max() as initial " @@ -1286,7 +1286,7 @@ public: using AccumT = OutputTupleT; - using InitT = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; // The output value type using OutputValueT = typename OutputTupleT::Value; @@ -1297,7 +1297,7 @@ public: ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::max())}; + init_value_t initial_value{AccumT(1, ::cuda::std::numeric_limits::max())}; return detail::reduce::dispatch( d_temp_storage, @@ -1400,8 +1400,8 @@ public: // Signed integer type for global offsets using OffsetT = detail::choose_offset_t; using InputT = detail::it_value_t; - using InitT = InputT; - using limits_t = ::cuda::std::numeric_limits; + using init_value_t = InputT; + using limits_t = ::cuda::std::numeric_limits; #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX static_assert(limits_t::is_specialized, "cub::DeviceReduce::Max uses cuda::std::numeric_limits::lowest() as " @@ -1746,7 +1746,7 @@ public: // The output value type using OutputValueT = typename OutputTupleT::Value; - using InitT = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; // Wrapped input iterator to produce index-value tuples using ArgIndexInputIteratorT = ArgIndexInputIterator; @@ -1754,7 +1754,7 @@ public: ArgIndexInputIteratorT d_indexed_in(d_in); // Initial value - InitT initial_value{AccumT(1, ::cuda::std::numeric_limits::lowest())}; + init_value_t initial_value{AccumT(1, ::cuda::std::numeric_limits::lowest())}; return detail::reduce::dispatch( d_temp_storage, diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index df11e8a097a..74fec0a43bd 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -330,10 +330,10 @@ struct DeviceScan // Unsigned integer type for global offsets using OffsetT = detail::choose_offset_t; - using InitT = cub::detail::it_value_t; + using init_value_t = cub::detail::it_value_t; // Initial value - InitT init_value{}; + init_value_t init_value{}; return detail::scan::dispatch( d_temp_storage, @@ -341,7 +341,7 @@ struct DeviceScan d_in, d_out, ::cuda::std::plus<>{}, - detail::InputValue(init_value), + detail::InputValue(init_value), static_cast(num_items), stream); } @@ -412,10 +412,10 @@ struct DeviceScan { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSum"); - using init_t = cub::detail::it_value_t; - init_t init_value{}; + using init_value_t = cub::detail::it_value_t; + init_value_t init_value{}; - return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue(init_value), num_items, env); + return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue(init_value), num_items, env); } //! @rst @@ -2415,8 +2415,8 @@ struct DeviceScan cudaStream_t stream = nullptr) { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSumByKey"); - using init_t = cub::detail::it_value_t; - init_t init_value{}; + using init_value_t = cub::detail::it_value_t; + init_value_t init_value{}; return scan_by_key_impl<::cuda::std::execution::env<>>( d_temp_storage, temp_storage_bytes, @@ -2983,7 +2983,7 @@ struct DeviceScan { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceScan::ExclusiveSumByKey"); - using init_t = cub::detail::it_value_t; + using init_value_t = cub::detail::it_value_t; return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) { using tuning_t = decltype(tuning); return scan_by_key_impl( @@ -2994,7 +2994,7 @@ struct DeviceScan d_values_out, equality_op, ::cuda::std::plus<>{}, - init_t{}, + init_value_t{}, num_items, stream); }); diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index d340d8cadd6..c7b1dc97b29 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -93,7 +93,7 @@ private: using input_value_t = cub::detail::it_value_t; using output_tuple_t = cub::detail::non_void_value_t>; using accum_t = output_tuple_t; - using init_t = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; using output_key_t = typename output_tuple_t::first_type; using output_value_t = typename output_tuple_t::second_type; @@ -109,7 +109,7 @@ private: constexpr bool is_min = ::cuda::std::is_same_v; auto sentinel = is_min ? ::cuda::std::numeric_limits::max() : ::cuda::std::numeric_limits::lowest(); - init_t initial_value{accum_t(1, sentinel)}; + init_value_t initial_value{accum_t(1, sentinel)}; using default_policy_selector_t = detail::segmented_reduce::policy_selector_from_types; @@ -157,7 +157,7 @@ private: typename BeginOffsetIteratorT, typename EndOffsetIteratorT, typename ReductionOpT, - typename InitT, + typename InitValueT, typename EnvT> CUB_RUNTIME_FUNCTION static cudaError_t variable_size_env_impl( InputIteratorT d_in, @@ -166,7 +166,7 @@ private: BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, - InitT initial_value, + InitValueT initial_value, EnvT env) { using requirements_t = ::cuda::std::execution:: @@ -242,7 +242,7 @@ private: typename InputIteratorT, typename OutputIteratorT, typename ReductionOpT, - typename InitT, + typename InitValueT, typename EnvT> CUB_RUNTIME_FUNCTION static cudaError_t fixed_size_env_impl( InputIteratorT d_in, @@ -250,7 +250,7 @@ private: ::cuda::std::int64_t num_segments, OffsetT segment_size, ReductionOpT reduction_op, - InitT initial_value, + InitValueT initial_value, EnvT env) { using requirements_t = ::cuda::std::execution:: @@ -832,7 +832,7 @@ public: using OffsetT = detail::common_iterator_value_t; using OutputT = detail::non_void_value_t>; - using init_t = OutputT; + using init_value_t = OutputT; static_assert(::cuda::std::is_integral_v, "Offset iterator value type should be integral."); if constexpr (::cuda::std::is_integral_v) { @@ -845,7 +845,7 @@ public: d_begin_offsets, d_end_offsets, ::cuda::std::plus<>{}, - init_t{}, // zero-initialize + init_value_t{}, // zero-initialize 0, // max_segment_size stream); } @@ -944,12 +944,12 @@ public: using OffsetT = detail::common_iterator_value_t; using OutputT = detail::non_void_value_t>; - using init_t = OutputT; + using init_value_t = OutputT; using op_t = ::cuda::std::plus<>; - using AccumT = ::cuda::std::__accumulator_t, init_t>; + using AccumT = ::cuda::std::__accumulator_t, init_value_t>; return variable_size_env_impl( - d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, op_t{}, init_t{}, env); + d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, op_t{}, init_value_t{}, env); } //! @rst @@ -1016,9 +1016,9 @@ public: static_assert(!::cuda::std::is_same_v, "InputIteratorT must be a real iterator; void* has no iterator_traits::value_type."); _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum"); - using init_t = detail::non_void_value_t>; + using init_value_t = detail::non_void_value_t>; return fixed_size_impl( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, segment_size, ::cuda::std::plus{}, init_t{}, stream); + d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, segment_size, ::cuda::std::plus{}, init_value_t{}, stream); } //! @rst @@ -1194,8 +1194,8 @@ public: using OffsetT = detail::common_iterator_value_t; using InputT = detail::it_value_t; - using init_t = InputT; - static_assert(::cuda::std::numeric_limits::is_specialized, + using init_value_t = InputT; + static_assert(::cuda::std::numeric_limits::is_specialized, "numeric_limits must be specialized for the input value type"); static_assert(::cuda::std::is_integral_v, "Offset iterator value type should be integral."); if constexpr (::cuda::std::is_integral_v) @@ -1209,7 +1209,7 @@ public: d_begin_offsets, d_end_offsets, ::cuda::minimum<>{}, - ::cuda::std::numeric_limits::max(), + ::cuda::std::numeric_limits::max(), 0, // max_segment_size stream); } @@ -1308,15 +1308,15 @@ public: using OffsetT = detail::common_iterator_value_t; using InputT = detail::it_value_t; - using init_t = InputT; + using init_value_t = InputT; using op_t = ::cuda::minimum<>; - using AccumT = ::cuda::std::__accumulator_t, init_t>; + using AccumT = ::cuda::std::__accumulator_t, init_value_t>; - static_assert(::cuda::std::numeric_limits::is_specialized, + static_assert(::cuda::std::numeric_limits::is_specialized, "numeric_limits must be specialized for the input value type"); return variable_size_env_impl( - d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, op_t{}, ::cuda::std::numeric_limits::max(), env); + d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, op_t{}, ::cuda::std::numeric_limits::max(), env); } //! @rst @@ -1594,7 +1594,7 @@ public: using OutputKeyT = typename OutputTupleT::Key; using OutputValueT = typename OutputTupleT::Value; using OverrideAccumT = OutputTupleT; - using InitT = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; static_assert(::cuda::std::is_same_v, "Output key type must be int."); static_assert(::cuda::std::numeric_limits::is_specialized, @@ -1604,7 +1604,7 @@ public: using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); - InitT initial_value{OverrideAccumT(1, ::cuda::std::numeric_limits::max())}; + init_value_t initial_value{OverrideAccumT(1, ::cuda::std::numeric_limits::max())}; static_assert(::cuda::std::is_integral_v, "Offset iterator value type should be integral."); if constexpr (::cuda::std::is_integral_v) @@ -1732,7 +1732,7 @@ public: using OutputKeyT = typename OutputTupleT::Key; using OutputValueT = typename OutputTupleT::Value; using OverrideAccumT = OutputTupleT; - using InitT = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; static_assert(::cuda::std::is_same_v, "Output key type must be int."); static_assert(::cuda::std::numeric_limits::is_specialized, @@ -1742,7 +1742,7 @@ public: using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); - InitT initial_value{OverrideAccumT(1, ::cuda::std::numeric_limits::max())}; + init_value_t initial_value{OverrideAccumT(1, ::cuda::std::numeric_limits::max())}; return variable_size_env_impl( d_indexed_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::ArgMin{}, initial_value, env); @@ -1980,9 +1980,9 @@ public: using OffsetT = detail::common_iterator_value_t; using InputT = cub::detail::it_value_t; - using init_t = InputT; + using init_value_t = InputT; - static_assert(::cuda::std::numeric_limits::is_specialized, + static_assert(::cuda::std::numeric_limits::is_specialized, "numeric_limits must be specialized for the input value type"); static_assert(::cuda::std::is_integral_v, "Offset iterator value type should be integral."); if constexpr (::cuda::std::is_integral_v) @@ -1996,7 +1996,7 @@ public: d_begin_offsets, d_end_offsets, ::cuda::maximum<>{}, - ::cuda::std::numeric_limits::lowest(), + ::cuda::std::numeric_limits::lowest(), 0, // max_segment_size stream); } @@ -2095,11 +2095,11 @@ public: using OffsetT = detail::common_iterator_value_t; using InputT = cub::detail::it_value_t; - using init_t = InputT; + using init_value_t = InputT; using op_t = ::cuda::maximum<>; - using AccumT = ::cuda::std::__accumulator_t, init_t>; + using AccumT = ::cuda::std::__accumulator_t, init_value_t>; - static_assert(::cuda::std::numeric_limits::is_specialized, + static_assert(::cuda::std::numeric_limits::is_specialized, "numeric_limits must be specialized for the input value type"); return variable_size_env_impl( @@ -2109,7 +2109,7 @@ public: d_begin_offsets, d_end_offsets, op_t{}, - ::cuda::std::numeric_limits::lowest(), + ::cuda::std::numeric_limits::lowest(), env); } @@ -2387,7 +2387,7 @@ public: using InputValueT = cub::detail::it_value_t; using OutputTupleT = cub::detail::non_void_value_t>; using OverrideAccumT = OutputTupleT; - using InitT = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; using OutputKeyT = typename OutputTupleT::Key; using OutputValueT = typename OutputTupleT::Value; @@ -2399,7 +2399,7 @@ public: using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); - InitT initial_value{OverrideAccumT(1, ::cuda::std::numeric_limits::lowest())}; + init_value_t initial_value{OverrideAccumT(1, ::cuda::std::numeric_limits::lowest())}; static_assert(::cuda::std::is_integral_v, "Offset iterator value type should be integral."); if constexpr (::cuda::std::is_integral_v) @@ -2525,7 +2525,7 @@ public: using InputValueT = cub::detail::it_value_t; using OutputTupleT = cub::detail::non_void_value_t>; using OverrideAccumT = OutputTupleT; - using InitT = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; using OutputKeyT = typename OutputTupleT::Key; using OutputValueT = typename OutputTupleT::Value; @@ -2537,7 +2537,7 @@ public: using ArgIndexInputIteratorT = ArgIndexInputIterator; ArgIndexInputIteratorT d_indexed_in(d_in); - InitT initial_value{OverrideAccumT(1, ::cuda::std::numeric_limits::lowest())}; + init_value_t initial_value{OverrideAccumT(1, ::cuda::std::numeric_limits::lowest())}; return variable_size_env_impl( d_indexed_in, d_out, num_segments, d_begin_offsets, d_end_offsets, cub::ArgMax{}, initial_value, env); diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 9b71e37a569..fb685b2c355 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -48,7 +48,7 @@ template struct DeviceReduceKernelSource @@ -63,7 +63,7 @@ struct DeviceReduceKernelSource OutputIteratorT, OffsetT, ReductionOpT, - InitT, + InitValueT, AccumT, TransformOpT>) @@ -77,7 +77,7 @@ struct DeviceReduceKernelSource OutputIteratorT, int, // Always used with int offsets ReductionOpT, - InitT, + InitValueT, AccumT>) CUB_RUNTIME_FUNCTION static constexpr size_t AccumSize() @@ -137,15 +137,15 @@ struct policy_selector_from_hub * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * - * @tparam InitT + * @tparam InitValueT * Initial value type */ template >, - typename AccumT = ::cuda::std::__accumulator_t, InitT>, + typename InitValueT = cub::detail::non_void_value_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitValueT>, typename TransformOpT = ::cuda::std::identity, typename PolicyHub = detail::reduce::policy_hub, typename KernelSource = detail::reduce::DeviceReduceKernelSource< @@ -154,7 +154,7 @@ template , typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> @@ -185,7 +185,7 @@ struct DispatchReduce ReductionOpT reduction_op; /// The initial value of the reduction - InitT init; + InitValueT init; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; @@ -210,7 +210,7 @@ struct DispatchReduce OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, - InitT init, + InitValueT init, cudaStream_t stream, int ptx_version, TransformOpT transform_op = {}, @@ -471,7 +471,7 @@ struct DispatchReduce OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, - InitT init, + InitValueT init, cudaStream_t stream, TransformOpT transform_op = {}, KernelSource kernel_source = {}, @@ -532,7 +532,7 @@ struct DispatchReduce * Unary transform functor type having member * `auto operator()(const T &a)` * - * @tparam InitT + * @tparam InitValueT * Initial value type */ template < @@ -541,11 +541,11 @@ template < typename OffsetT, typename ReductionOpT, typename TransformOpT, - typename InitT, + typename InitValueT, typename AccumT = ::cuda::std::__accumulator_t>, - InitT>, + InitValueT>, typename PolicyHub = detail::reduce::policy_hub, typename KernelSource = detail::reduce::DeviceReduceKernelSource< typename PolicyHub::MaxPolicy, @@ -553,7 +553,7 @@ template < OutputIteratorT, OffsetT, ReductionOpT, - InitT, + InitValueT, AccumT, TransformOpT>, typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> @@ -562,7 +562,7 @@ using DispatchTransformReduce = OutputIteratorT, OffsetT, ReductionOpT, - InitT, + InitValueT, AccumT, TransformOpT, PolicyHub, @@ -576,7 +576,7 @@ template @@ -587,7 +587,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes( OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, - InitT init, + InitValueT init, cudaStream_t stream, TransformOpT transform_op, reduce_policy active_policy, @@ -706,13 +706,13 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t invoke_passes( // select the accumulator type using an overload set, so __accumulator_t and invoke_result_t are not instantiated when // an overriding accumulator type is present. This is needed by CCCL.C. -template +template _CCCL_HOST_DEVICE_API auto select_accum_t(use_default*) -> ::cuda::std::__accumulator_t>, - InitT>; + InitValueT>; template >, + typename InitValueT = non_void_value_t>, typename TransformOpT = ::cuda::std::identity, typename AccumT = - decltype(select_accum_t(static_cast(nullptr))), + decltype(select_accum_t(static_cast(nullptr))), typename PolicySelector = policy_selector_from_types, typename KernelSource = - DeviceReduceKernelSource, + DeviceReduceKernelSource, typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> #if _CCCL_HAS_CONCEPTS() requires reduce_policy_selector @@ -743,7 +743,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, - InitT init, + InitValueT init, cudaStream_t stream, TransformOpT transform_op = {}, PolicySelector policy_selector = {}, diff --git a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh index 4b54e9d53bc..3d43f5e7ae2 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh @@ -45,9 +45,9 @@ namespace detail::rfa template using transformed_input_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t>; -template +template using accum_t = - ::cuda::std::__accumulator_t<::cuda::std::plus<>, InitT, transformed_input_t>>; + ::cuda::std::__accumulator_t<::cuda::std::plus<>, InitValueT, transformed_input_t>>; template >* = nullptr> struct deterministic_sum_t @@ -83,7 +83,7 @@ template @@ -94,7 +94,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t invok OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, - InitT init, + InitValueT init, cudaStream_t stream, TransformOpT transform_op, rfa_policy active_policy, @@ -124,7 +124,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t invok InputIteratorT, OutputIteratorT, ReductionOpT, - InitT, + InitValueT, DeterministicAccumT, TransformOpT>, d_in, @@ -152,7 +152,7 @@ template @@ -163,7 +163,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t invok OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, - InitT init, + InitValueT init, cudaStream_t stream, TransformOpT transform_op, rfa_policy active_policy, @@ -297,7 +297,7 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t invok DeterministicAccumT*, OutputIteratorT, ReductionOpT, - InitT, + InitValueT, DeterministicAccumT>, d_block_reductions, d_out, @@ -322,9 +322,9 @@ CUB_RUNTIME_FUNCTION _CCCL_VISIBILITY_HIDDEN _CCCL_FORCEINLINE cudaError_t invok template , + typename AccumT = accum_t, typename PolicySelector = policy_selector_from_types, typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( @@ -333,7 +333,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, - InitT init = {}, + InitValueT init = {}, cudaStream_t stream = {}, TransformOpT transform_op = {}, PolicySelector policy_selector = {}, @@ -374,7 +374,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( OutputIteratorT, OffsetT, deterministic_add_t, - InitT, + InitValueT, typename deterministic_add_t::DeterministicAcc, TransformOpT>( d_temp_storage, @@ -395,7 +395,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( OutputIteratorT, OffsetT, deterministic_add_t, - InitT, + InitValueT, typename deterministic_add_t::DeterministicAcc, TransformOpT>( d_temp_storage, diff --git a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh index b57bf088072..8c890f95edb 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh @@ -41,7 +41,7 @@ template struct DeviceReduceNondeterministicKernelSource @@ -55,12 +55,12 @@ struct DeviceReduceNondeterministicKernelSource OffsetT, ReductionOpT, AccumT, - InitT, + InitValueT, TransformOpT>); CUB_RUNTIME_FUNCTION static constexpr size_t InitSize() { - return sizeof(InitT); + return sizeof(InitValueT); } }; @@ -78,14 +78,14 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE void* get_device_ptr(void* ptr) struct nondeterministic_no_override {}; -template +template _CCCL_HOST_DEVICE_API auto select_nondeterministic_accum_t(nondeterministic_no_override*) -> ::cuda::std::__accumulator_t>, - InitT>; + InitValueT>; template O //! @tparam ReductionOpT //! Binary reduction functor type having member `auto operator()(const T &a, const U &b)` //! -//! @tparam InitT +//! @tparam InitValueT //! Initial value type //! //! @param[in] d_temp_storage @@ -141,9 +141,9 @@ template >, + typename InitValueT = non_void_value_t>, typename TransformOpT = ::cuda::std::identity, - typename AccumT = decltype(select_nondeterministic_accum_t( + typename AccumT = decltype(select_nondeterministic_accum_t( static_cast(nullptr))), typename PolicySelector = policy_selector_from_types, typename KernelSource = DeviceReduceNondeterministicKernelSource< @@ -152,7 +152,7 @@ template , typename KernelLauncherFactory = TripleChevronFactory> @@ -166,7 +166,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( OutputIteratorT d_out, OffsetT num_items, ReductionOpT reduction_op, - InitT init, + InitValueT init, cudaStream_t stream, TransformOpT transform_op = {}, PolicySelector policy_selector = {}, diff --git a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh index 63ffd1c7a7a..c4304d27d77 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh @@ -44,7 +44,7 @@ template struct DeviceSegmentedReduceKernelSource { @@ -61,7 +61,7 @@ struct DeviceSegmentedReduceKernelSource EndOffsetIteratorT, OffsetT, ReductionOpT, - InitT, + InitValueT, AccumT>) }; @@ -135,7 +135,7 @@ public: * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * - * @tparam InitT + * @tparam InitValueT * value type */ template >, - typename AccumT = ::cuda::std::__accumulator_t, InitT>, + typename InitValueT = cub::detail::non_void_value_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitValueT>, typename PolicyHub = detail::segmented_reduce::policy_hub, typename KernelSource = detail::segmented_reduce::DeviceSegmentedReduceKernelSource< detail::segmented_reduce::policy_selector_from_hub, @@ -155,7 +155,7 @@ template , typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> struct DispatchSegmentedReduce @@ -198,7 +198,7 @@ struct DispatchSegmentedReduce ReductionOpT reduction_op; /// The initial value of the reduction - InitT init; + InitValueT init; /// CUDA stream to launch kernels within. Default is stream0. cudaStream_t stream; @@ -224,7 +224,7 @@ struct DispatchSegmentedReduce BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, - InitT init, + InitValueT init, cudaStream_t stream, int ptx_version, KernelSource kernel_source = {}, @@ -415,7 +415,7 @@ struct DispatchSegmentedReduce BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, - InitT init, + InitValueT init, cudaStream_t stream, KernelSource kernel_source = {}, KernelLauncherFactory launcher_factory = {}, @@ -470,12 +470,12 @@ namespace detail::segmented_reduce { // select the accumulator type using an overload set, so __accumulator_t is not instantiated when // an overriding accumulator type is present. This is needed by CCCL.C. -template +template _CCCL_HOST_DEVICE_API auto select_segmented_accum_t(use_default*) - -> ::cuda::std::__accumulator_t, InitT>; + -> ::cuda::std::__accumulator_t, InitValueT>; template , int> = 0> @@ -493,9 +493,9 @@ template < common_iterator_value, ::cuda::std::type_identity>::type, typename ReductionOpT, - typename InitT = non_void_value_t>, + typename InitValueT = non_void_value_t>, typename AccumT = - decltype(select_segmented_accum_t(static_cast(nullptr))), + decltype(select_segmented_accum_t(static_cast(nullptr))), typename PolicySelector = policy_selector_from_types, typename KernelSource = DeviceSegmentedReduceKernelSource< PolicySelector, @@ -505,7 +505,7 @@ template < EndOffsetIteratorT, OffsetT, ReductionOpT, - InitT, + InitValueT, AccumT>, typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> #if _CCCL_HAS_CONCEPTS() @@ -520,7 +520,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch( BeginOffsetIteratorT d_begin_offsets, EndOffsetIteratorT d_end_offsets, ReductionOpT reduction_op, - InitT init, + InitValueT init, size_t max_segment_size, cudaStream_t stream, PolicySelector policy_selector = {}, @@ -668,7 +668,7 @@ template struct DeviceFixedSizeSegmentedReduceKernelSource { @@ -677,11 +677,11 @@ struct DeviceFixedSizeSegmentedReduceKernelSource CUB_DEFINE_KERNEL_GETTER( FixedSizeSegmentedReduceKernel, - DeviceFixedSizeSegmentedReduceKernel) + DeviceFixedSizeSegmentedReduceKernel) CUB_DEFINE_KERNEL_GETTER( FixedSizeSegmentedReduceKernelFinal, - DeviceFixedSizeSegmentedReduceKernel) + DeviceFixedSizeSegmentedReduceKernel) CUB_RUNTIME_FUNCTION static constexpr ::cuda::std::size_t AccumSize() { @@ -694,8 +694,8 @@ template >, - typename AccumT = decltype(select_segmented_accum_t( + typename InitValueT = non_void_value_t>, + typename AccumT = decltype(select_segmented_accum_t( static_cast(nullptr))), typename PolicySelector = policy_selector_from_types, typename KernelSource = DeviceFixedSizeSegmentedReduceKernelSource< @@ -704,7 +704,7 @@ template , typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY, ::cuda::std::enable_if_t<::cuda::std::is_arithmetic_v, int> = 0> @@ -719,7 +719,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE auto dispatch_fixed_size( ::cuda::std::int64_t num_segments, OffsetT segment_size, ReductionOpT reduction_op, - InitT init, + InitValueT init, cudaStream_t stream, PolicySelector policy_selector = {}, KernelSource kernel_source = {}, diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index 0b94dcdad34..47d089b665f 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -152,7 +152,7 @@ struct unzip_and_write_arg_extremum_op // The streaming reduction requires two overloads, one used for selecting the extremum within one partition and one // for selecting the extremum across partitions. // -// @tparam InitT +// @tparam InitValueT // Initial value type // // @tparam PolicySelector diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index b8efb2c6dd0..2a7f561251d 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -43,14 +43,14 @@ struct empty_problem_init_t } }; -template -_CCCL_HOST_DEVICE _CCCL_FORCEINLINE InitT unwrap_empty_problem_init(InitT init) +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE InitValueT unwrap_empty_problem_init(InitValueT init) { return init; } -template -_CCCL_HOST_DEVICE _CCCL_FORCEINLINE InitT unwrap_empty_problem_init(empty_problem_init_t empty_problem_init) +template +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE InitValueT unwrap_empty_problem_init(empty_problem_init_t empty_problem_init) { return empty_problem_init.init; } @@ -63,9 +63,9 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE InitT unwrap_empty_problem_init(empty_proble * @param init Initial value * @param block_aggregate Aggregate value computed by the block */ -template +template _CCCL_HOST_DEVICE void -finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT reduction_op, InitT init, AccumT block_aggregate) +finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT reduction_op, InitValueT init, AccumT block_aggregate) { *d_out = reduction_op(init, block_aggregate); } @@ -76,9 +76,9 @@ finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT reduction_op, I * @param d_out Iterator to the output aggregate * @param block_aggregate Aggregate value computed by the block */ -template +template _CCCL_HOST_DEVICE void -finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_init_t, AccumT block_aggregate) +finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_init_t, AccumT block_aggregate) { *d_out = block_aggregate; } @@ -100,7 +100,7 @@ finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_ * Binary reduction functor type having member * `auto operator()(const T &a, const U &b)` * - * @tparam InitT + * @tparam InitValueT * Initial value type * * @tparam AccumT @@ -192,7 +192,7 @@ __launch_bounds__(int(current_policy().reduce.threads_per_block) * Binary reduction functor type having member * `T operator()(const T &a, const U &b)` * - * @tparam InitT + * @tparam InitValueT * Initial value type * * @tparam AccumT @@ -218,7 +218,7 @@ template #if _CCCL_HAS_CONCEPTS() @@ -230,7 +230,7 @@ _CCCL_KERNEL_ATTRIBUTES __launch_bounds__( OutputIteratorT d_out, _CCCL_GRID_CONSTANT const OffsetT num_items, ReductionOpT reduction_op, - _CCCL_GRID_CONSTANT const InitT init, + _CCCL_GRID_CONSTANT const InitValueT init, TransformOpT transform_op) { static constexpr agent_reduce_policy policy = current_policy().single_tile; @@ -282,7 +282,7 @@ template #if _CCCL_HAS_CONCEPTS() requires reduce_nondeterministic::reduce_nondeterministic_policy_selector @@ -295,7 +295,7 @@ _CCCL_KERNEL_ATTRIBUTES __launch_bounds__(int( _CCCL_GRID_CONSTANT const OffsetT num_items, GridEvenShare even_share, ReductionOpT reduction_op, - _CCCL_GRID_CONSTANT const InitT init, + _CCCL_GRID_CONSTANT const InitValueT init, TransformOpT transform_op) { // todo: This static_assert fails with nvc++ CUDA compilation. diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh index 296f51ca9ab..a74ad1775e7 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce_deterministic.cuh @@ -160,7 +160,7 @@ __launch_bounds__(int(current_policy().reduce.threads_per_block) * Binary reduction functor type having member * `T operator()(const T &a, const U &b)` * - * @tparam InitT + * @tparam InitValueT * Initial value type * * @tparam AccumT @@ -185,7 +185,7 @@ template _CCCL_KERNEL_ATTRIBUTES __launch_bounds__( @@ -194,7 +194,7 @@ _CCCL_KERNEL_ATTRIBUTES __launch_bounds__( OutputIteratorT d_out, int num_items, ReductionOpT reduction_op, - InitT init, + InitValueT init, TransformOpT transform_op) { constexpr rfa::single_tile_policy policy = current_policy().single_tile; diff --git a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh index 03286cfe595..fb5ee1fe69a 100644 --- a/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_segmented_reduce.cuh @@ -65,7 +65,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void NormalizeReductionOutput( * Binary reduction functor type having member * `T operator()(const T &a, const U &b)` * - * @tparam InitT + * @tparam InitValueT * Initial value type * * @param[in] d_in @@ -105,7 +105,7 @@ template #if _CCCL_HAS_CONCEPTS() requires segmented_reduce_policy_selector @@ -118,7 +118,7 @@ _CCCL_KERNEL_ATTRIBUTES __launch_bounds__(current_policy().large _CCCL_GRID_CONSTANT const EndOffsetIteratorT d_end_offsets, _CCCL_GRID_CONSTANT const int num_segments, ReductionOpT reduction_op, - _CCCL_GRID_CONSTANT const InitT init, + _CCCL_GRID_CONSTANT const InitValueT init, _CCCL_GRID_CONSTANT const size_t max_segment_size) { static constexpr segmented_reduce_policy full_policy = current_policy(); @@ -275,7 +275,7 @@ _CCCL_KERNEL_ATTRIBUTES __launch_bounds__(current_policy().large * Binary reduction functor type having member * `T operator()(const T &a, const U &b)` * - * @tparam InitT + * @tparam InitValueT * Initial value type * * @param[in] d_in @@ -310,7 +310,7 @@ template #if _CCCL_HAS_CONCEPTS() requires segmented_reduce_policy_selector @@ -322,7 +322,7 @@ __launch_bounds__(current_policy().large_reduce.threads_per_bloc _CCCL_GRID_CONSTANT const OffsetT segment_size, _CCCL_GRID_CONSTANT const int num_segments, ReductionOpT reduction_op, - _CCCL_GRID_CONSTANT const InitT init, + _CCCL_GRID_CONSTANT const InitValueT init, _CCCL_GRID_CONSTANT AccumT* const d_partial_out, _CCCL_GRID_CONSTANT const int full_chunk_size, _CCCL_GRID_CONSTANT const int blocks_per_segment) diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index 57a83bf5998..cde5f1990a4 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -147,8 +147,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f // Run test c2h::device_vector out_result(num_segments); auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_t = cub::detail::it_value_t; - device_reduce(unwrap_it(d_in_it), unwrap_it(d_out_it), num_items, reduction_op, init_t{}); + using init_value_t = cub::detail::it_value_t; + device_reduce(unwrap_it(d_in_it), unwrap_it(d_out_it), num_items, reduction_op, init_value_t{}); // Verify result REQUIRE(expected_result == out_result[0]); diff --git a/cub/test/catch2_test_device_reduce.cuh b/cub/test/catch2_test_device_reduce.cuh index a904e111d8f..1e6e06eae7d 100644 --- a/cub/test/catch2_test_device_reduce.cuh +++ b/cub/test/catch2_test_device_reduce.cuh @@ -224,7 +224,7 @@ template inline void compute_host_reference( InputItT h_in, @@ -232,7 +232,7 @@ inline void compute_host_reference( SizeItT h_sizes_begin, std::size_t num_segments, ReductionOpT reduction_op, - InitT init, + InitValueT init, ResultOutItT h_data_out) { for (std::size_t segment = 0; segment < num_segments; segment++) diff --git a/cub/test/catch2_test_device_reduce_deterministic.cu b/cub/test/catch2_test_device_reduce_deterministic.cu index 0512528d268..7618a87a32c 100644 --- a/cub/test/catch2_test_device_reduce_deterministic.cu +++ b/cub/test/catch2_test_device_reduce_deterministic.cu @@ -307,7 +307,7 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff test_types) { using type = typename c2h::get<0, TestType>; - using init_t = type; + using init_value_t = type; const auto env = cuda::execution::require(cuda::execution::determinism::gpu_to_gpu); constexpr int num_items = 1 << 10; @@ -327,7 +327,7 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff c2h::device_vector d_output(1); auto error = - cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::std::plus{}, init_t{}, env); + cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::std::plus{}, init_value_t{}, env); REQUIRE(error == cudaSuccess); c2h::host_vector h_input = d_input; @@ -336,7 +336,7 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff // Requires `std::accumulate` to produce deterministic result which is required for comparison // with the device RFA result. // NOTE: `std::reduce` is not equivalent - h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), init_t{}, cuda::std::plus{}); + h_expected[0] = std::accumulate(h_input.begin(), h_input.end(), init_value_t{}, cuda::std::plus{}); c2h::host_vector h_output = d_output; REQUIRE(h_expected == h_output); @@ -346,7 +346,7 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff { c2h::device_vector d_output(1); - init_t init_value{}; + init_value_t init_value{}; auto error = cub::DeviceReduce::Reduce( d_input.begin(), d_output.begin(), num_items, cuda::std::bit_xor<>{}, init_value, env); @@ -364,7 +364,7 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff { c2h::device_vector d_output(1); - init_t init_value{}; + init_value_t init_value{}; auto error = cub::DeviceReduce::Reduce( d_input.begin(), d_output.begin(), num_items, cuda::std::logical_or<>{}, init_value, env); @@ -383,10 +383,10 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff { c2h::device_vector d_output(1); - init_t init_value{cuda::std::numeric_limits::max()}; + init_value_t init_value{cuda::std::numeric_limits::max()}; auto error = - cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::minimum{}, init_value, env); + cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::minimum{}, init_value, env); REQUIRE(error == cudaSuccess); c2h::host_vector h_input = d_input; @@ -401,10 +401,10 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff { c2h::device_vector d_output(1); - init_t init_value{cuda::std::numeric_limits::min()}; + init_value_t init_value{cuda::std::numeric_limits::min()}; auto error = - cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::maximum{}, init_value, env); + cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::maximum{}, init_value, env); REQUIRE(error == cudaSuccess); c2h::host_vector h_input = d_input; diff --git a/cub/test/catch2_test_device_reduce_dispatcher.cu b/cub/test/catch2_test_device_reduce_dispatcher.cu index ce4ed8c7064..ec1270c708c 100644 --- a/cub/test/catch2_test_device_reduce_dispatcher.cu +++ b/cub/test/catch2_test_device_reduce_dispatcher.cu @@ -43,7 +43,7 @@ C2H_TEST("Dispatch reduce can be called with custom policy_hub", "[reduce][devic { using T = c2h::get<0, TestType>; using offset_t = int32_t; - using init_t = T; + using init_value_t = T; using op_t = cuda::std::plus<>; using accum_t = cuda::std::__accumulator_t; @@ -62,17 +62,17 @@ C2H_TEST("Dispatch reduce can be called with custom policy_hub", "[reduce][devic decltype(d_out_it), offset_t, op_t, - init_t, + init_value_t, accum_t, ::cuda::std::identity, policy_hub_t>; size_t temp_storage_bytes = 0; - dispatch_t::Dispatch(nullptr, temp_storage_bytes, d_in_it, d_out_it, num_items, op_t{}, init_t{}, nullptr); + dispatch_t::Dispatch(nullptr, temp_storage_bytes, d_in_it, d_out_it, num_items, op_t{}, init_value_t{}, nullptr); c2h::device_vector temp_storage(temp_storage_bytes, thrust::no_init); dispatch_t::Dispatch( - temp_storage.data().get(), temp_storage_bytes, d_in_it, d_out_it, num_items, op_t{}, init_t{}, nullptr); + temp_storage.data().get(), temp_storage_bytes, d_in_it, d_out_it, num_items, op_t{}, init_value_t{}, nullptr); // Verify result const T expected_result = static_cast(compute_single_problem_reference(in_items, op_t{}, accum_t{})); diff --git a/cub/test/catch2_test_device_reduce_env.cu b/cub/test/catch2_test_device_reduce_env.cu index d1f4345fd78..00bca4a3a1f 100644 --- a/cub/test/catch2_test_device_reduce_env.cu +++ b/cub/test/catch2_test_device_reduce_env.cu @@ -401,13 +401,13 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements) using num_items_t = int; using offset_t = cub::detail::choose_offset_t; using transform_t = cuda::std::identity; - using init_t = accumulator_t; + using init_value_t = accumulator_t; num_items_t num_items = GENERATE(1 << 4, 1 << 24); auto d_in = cuda::constant_iterator(1.0f); auto d_out = thrust::device_vector(1); - init_t init = 0; + init_value_t init = 0; size_t expected_bytes_allocated{}; // To check if a given algorithm implementation is used, we check if associated kernels are invoked. @@ -427,7 +427,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements) decltype(d_out.begin()), offset_t, op_t, - init_t, + init_value_t, accumulator_t, transform_t>), reinterpret_cast( @@ -439,7 +439,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements) decltype(d_out.begin()), int, // always used with int offset op_t, - init_t, + init_value_t, accumulator_t>)}; } else if constexpr (cub::detail::is_non_deterministic_v) @@ -467,7 +467,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements) decltype(raw_ptr), offset_t, op_t, - init_t, + init_value_t, accumulator_t, transform_t>)}; } @@ -481,7 +481,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements) REQUIRE(cudaSuccess == cub::detail::rfa:: - dispatch( + dispatch( nullptr, expected_bytes_allocated, d_in, d_out.begin(), num_items, init)); auto k1 = cub::detail::reduce::DeterministicDeviceReduceSingleTileKernel< @@ -489,7 +489,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements) decltype(d_in), output_it_t, reduction_op_t, - init_t, + init_value_t, deterministic_accum_t, transform_t>; auto k2 = cub::detail::reduce:: @@ -499,7 +499,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements) deterministic_accum_t*, output_it_t, reduction_op_t, - init_t, + init_value_t, deterministic_accum_t, transform_t>; // TODO(bgruber): enable this when we have Catch2 3.13+ @@ -529,13 +529,13 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements) using num_items_t = int; using offset_t = cub::detail::choose_offset_t; using transform_t = cuda::std::identity; - using init_t = accumulator_t; + using init_value_t = accumulator_t; num_items_t num_items = GENERATE(1 << 4, 1 << 24); auto d_in = cuda::constant_iterator(1.0f); auto d_out = thrust::device_vector(1); - [[maybe_unused]] init_t init = 0; + [[maybe_unused]] init_value_t init = 0; size_t expected_bytes_allocated{}; // To check if a given algorithm implementation is used, we check if associated kernels are invoked. @@ -553,7 +553,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements) decltype(d_out.begin()), offset_t, op_t, - init_t, + init_value_t, accumulator_t, transform_t>), reinterpret_cast( @@ -565,7 +565,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements) decltype(d_out.begin()), int, // always used with int offset op_t, - init_t, + init_value_t, accumulator_t>)}; } else if constexpr (cub::detail::is_non_deterministic_v) @@ -593,7 +593,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements) decltype(raw_ptr), offset_t, op_t, - init_t, + init_value_t, accumulator_t, transform_t>)}; } @@ -607,7 +607,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements) REQUIRE(cudaSuccess == cub::detail::rfa:: - dispatch( + dispatch( nullptr, expected_bytes_allocated, d_in, d_out.begin(), num_items, init)); auto k1 = cub::detail::reduce::DeterministicDeviceReduceSingleTileKernel< @@ -615,7 +615,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements) decltype(d_in), output_it_t, reduction_op_t, - init_t, + init_value_t, deterministic_accum_t, transform_t>; auto k2 = cub::detail::reduce:: @@ -625,7 +625,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements) deterministic_accum_t*, output_it_t, reduction_op_t, - init_t, + init_value_t, deterministic_accum_t, transform_t>; // TODO(bgruber): enable this when we have Catch2 3.13+ @@ -653,7 +653,7 @@ C2H_TEST("Device reduce not_guaranteed falls back when output type differs from using output_t = cuda::std::uint8_t; using accumulator_t = int; using op_t = cuda::std::plus<>; - using init_t = input_t; + using init_value_t = input_t; using num_items_t = int; using offset_t = cub::detail::choose_offset_t; using transform_t = cuda::std::identity; @@ -661,7 +661,7 @@ C2H_TEST("Device reduce not_guaranteed falls back when output type differs from auto d_in = thrust::device_vector{0, 1, 2, 3}; auto d_out = thrust::device_vector(1); num_items_t num_items = static_cast(d_in.size()); - init_t init{}; + init_value_t init{}; size_t expected_bytes_allocated{}; REQUIRE(cudaSuccess @@ -677,7 +677,7 @@ C2H_TEST("Device reduce not_guaranteed falls back when output type differs from decltype(d_out.begin()), offset_t, op_t, - init_t, + init_value_t, accumulator_t, transform_t>), reinterpret_cast( @@ -690,7 +690,7 @@ C2H_TEST("Device reduce not_guaranteed falls back when output type differs from decltype(d_out.begin()), int, // always used with int offset op_t, - init_t, + init_value_t, accumulator_t>)}; auto env = stdexec::env{cuda::execution::require(cuda::execution::determinism::not_guaranteed), @@ -708,7 +708,7 @@ C2H_TEST("Device sum not_guaranteed falls back when output type differs from acc using output_t = cuda::std::uint8_t; using accumulator_t = int; using op_t = cuda::std::plus<>; - using init_t = output_t; + using init_value_t = output_t; using num_items_t = int; using offset_t = cub::detail::choose_offset_t; using transform_t = cuda::std::identity; @@ -730,7 +730,7 @@ C2H_TEST("Device sum not_guaranteed falls back when output type differs from acc decltype(d_out.begin()), offset_t, op_t, - init_t, + init_value_t, accumulator_t, transform_t>), reinterpret_cast( @@ -743,7 +743,7 @@ C2H_TEST("Device sum not_guaranteed falls back when output type differs from acc decltype(d_out.begin()), int, // always used with int offset op_t, - init_t, + init_value_t, accumulator_t>)}; auto env = stdexec::env{cuda::execution::require(cuda::execution::determinism::not_guaranteed), diff --git a/cub/test/catch2_test_device_reduce_iterators.cu b/cub/test/catch2_test_device_reduce_iterators.cu index 2029c2ca6f0..ed494c52e8d 100644 --- a/cub/test/catch2_test_device_reduce_iterators.cu +++ b/cub/test/catch2_test_device_reduce_iterators.cu @@ -72,19 +72,19 @@ C2H_TEST("Device reduce works with fancy input iterators", "[reduce][device]", i auto in_it = cuda::constant_iterator(default_constant); using op_t = cuda::std::plus<>; - using init_t = output_t; + using init_value_t = output_t; // Binary reduction operator auto reduction_op = op_t{}; // Prepare verification data - using accum_t = cuda::std::__accumulator_t; + using accum_t = cuda::std::__accumulator_t; output_t expected_result = compute_single_problem_reference(in_it, in_it + num_items, reduction_op, accum_t{}); // Run test c2h::device_vector out_result(num_segments); auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - device_reduce(in_it, d_out_it, num_items, reduction_op, init_t{}); + device_reduce(in_it, d_out_it, num_items, reduction_op, init_value_t{}); // Verify result REQUIRE(expected_result == out_result[0]); @@ -111,11 +111,11 @@ C2H_TEST("Device reduce compiles with discard output iterator", "[reduce][device auto in_it = cuda::constant_iterator(default_constant); using op_t = cuda::std::plus<>; - using init_t = output_t; + using init_value_t = output_t; // Binary reduction operator auto reduction_op = op_t{}; // Run test - device_reduce(in_it, cuda::discard_iterator(), num_items, reduction_op, init_t{}); + device_reduce(in_it, cuda::discard_iterator(), num_items, reduction_op, init_value_t{}); } diff --git a/cub/test/catch2_test_device_reduce_nondeterministic.cu b/cub/test/catch2_test_device_reduce_nondeterministic.cu index 6ed2ca4b4c0..a56dcee1cd7 100644 --- a/cub/test/catch2_test_device_reduce_nondeterministic.cu +++ b/cub/test/catch2_test_device_reduce_nondeterministic.cu @@ -218,14 +218,14 @@ C2H_TEST("Nondeterministic Device reduce works with float and double on gpu with auto* raw_ptr = thrust::raw_pointer_cast(d_output.data()); using output_it_t = decltype(raw_ptr); - using init_t = type; + using init_value_t = type; using accum_t = type; using transform_t = square_t; std::size_t temp_storage_bytes{}; auto error = cub::detail::reduce_nondeterministic::dispatch( - nullptr, temp_storage_bytes, input, raw_ptr, num_items, cuda::std::plus{}, init_t{}, nullptr, transform_t{}); + nullptr, temp_storage_bytes, input, raw_ptr, num_items, cuda::std::plus{}, init_value_t{}, nullptr, transform_t{}); REQUIRE(error == cudaSuccess); c2h::device_vector temp_storage(temp_storage_bytes, thrust::no_init); @@ -237,7 +237,7 @@ C2H_TEST("Nondeterministic Device reduce works with float and double on gpu with raw_ptr, num_items, cuda::std::plus{}, - init_t{}, + init_value_t{}, nullptr, transform_t{}); REQUIRE(error == cudaSuccess); diff --git a/cub/test/catch2_test_device_scan.cu b/cub/test/catch2_test_device_scan.cu index 177086393ae..795bd0ca6e8 100644 --- a/cub/test/catch2_test_device_scan.cu +++ b/cub/test/catch2_test_device_scan.cu @@ -262,8 +262,8 @@ C2H_TEST("Device scan works with all device interfaces", "[scan][device]", full_ // Run test c2h::device_vector out_result(num_items); auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_t = cub::detail::it_value_t; - device_exclusive_scan(unwrap_it(d_in_it), unwrap_it(d_out_it), scan_op, init_t{}, num_items); + using init_value_t = cub::detail::it_value_t; + device_exclusive_scan(unwrap_it(d_in_it), unwrap_it(d_out_it), scan_op, init_value_t{}, num_items); // Verify result REQUIRE_THAT_QUIET(expected_result, Equals(out_result)); @@ -271,7 +271,7 @@ C2H_TEST("Device scan works with all device interfaces", "[scan][device]", full_ // Run test in-place if constexpr (std::is_same_v) { - device_exclusive_scan(unwrap_it(d_in_it), unwrap_it(d_in_it), scan_op, init_t{}, num_items); + device_exclusive_scan(unwrap_it(d_in_it), unwrap_it(d_in_it), scan_op, init_value_t{}, num_items); // Verify result REQUIRE_THAT_QUIET(expected_result, Equals(in_items)); @@ -298,10 +298,10 @@ C2H_TEST("Device scan works with all device interfaces", "[scan][device]", full_ // Run test c2h::device_vector out_result(num_items); auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_t = cub::detail::it_value_t; - c2h::device_vector d_initial_value(1); - d_initial_value[0] = static_cast(*unwrap_it(&init_value)); - auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); + using init_value_t = cub::detail::it_value_t; + c2h::device_vector d_initial_value(1); + d_initial_value[0] = static_cast(*unwrap_it(&init_value)); + auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); device_exclusive_scan(unwrap_it(d_in_it), unwrap_it(d_out_it), scan_op, future_init_value, num_items); // Verify result diff --git a/cub/test/catch2_test_device_scan.cuh b/cub/test/catch2_test_device_scan.cuh index 5aeda0cbcbc..bb9b52525bd 100644 --- a/cub/test/catch2_test_device_scan.cuh +++ b/cub/test/catch2_test_device_scan.cuh @@ -36,11 +36,11 @@ struct Mod2Equality } }; -template -void compute_exclusive_scan_reference(InputIt first, InputIt last, OutputIt result, InitT init, BinaryOp op) +template +void compute_exclusive_scan_reference(InputIt first, InputIt last, OutputIt result, InitValueT init, BinaryOp op) { using value_t = cub::detail::it_value_t; - using accum_t = ::cuda::std::__accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::it_value_t; accum_t acc = static_cast(init); for (; first != last; ++first) @@ -51,11 +51,11 @@ void compute_exclusive_scan_reference(InputIt first, InputIt last, OutputIt resu } } -template -void compute_inclusive_scan_reference(InputIt first, InputIt last, OutputIt result, BinaryOp op, InitT init) +template +void compute_inclusive_scan_reference(InputIt first, InputIt last, OutputIt result, BinaryOp op, InitValueT init) { using value_t = cub::detail::it_value_t; - using accum_t = ::cuda::std::__accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::it_value_t; accum_t acc = static_cast(init); for (; first != last; ++first) @@ -70,18 +70,18 @@ template + typename InitValueT> void compute_exclusive_scan_by_key_reference( ValueInItT h_values_it, KeyInItT h_keys_it, ValuesOutItT result_out_it, ScanOpT scan_op, EqualityOpT equality_op, - InitT init, + InitValueT init, std::size_t num_items) { using value_t = cub::detail::it_value_t; - using accum_t = ::cuda::std::__accumulator_t; + using accum_t = ::cuda::std::__accumulator_t; using output_t = cub::detail::it_value_t; if (num_items > 0) @@ -104,14 +104,14 @@ void compute_exclusive_scan_by_key_reference( } } -template +template void compute_exclusive_scan_by_key_reference( const c2h::device_vector& d_values, const c2h::device_vector& d_keys, ValuesOutItT result_out_it, ScanOpT scan_op, EqualityOpT equality_op, - InitT init) + InitValueT init) { c2h::host_vector host_values(d_values); c2h::host_vector host_keys(d_keys); diff --git a/cub/test/catch2_test_device_scan_by_key.cu b/cub/test/catch2_test_device_scan_by_key.cu index 5cedd9084d8..bbc25d518f2 100644 --- a/cub/test/catch2_test_device_scan_by_key.cu +++ b/cub/test/catch2_test_device_scan_by_key.cu @@ -219,9 +219,9 @@ C2H_TEST("Device scan works with all device interfaces", "[by_key][scan][device] // Run test c2h::device_vector out_values(num_items); auto d_values_out_it = thrust::raw_pointer_cast(out_values.data()); - using init_t = cub::detail::it_value_t; + using init_value_t = cub::detail::it_value_t; device_exclusive_scan_by_key( - d_keys_it, unwrap_it(d_values_it), unwrap_it(d_values_out_it), scan_op, init_t{}, num_items, eq_op_t{}); + d_keys_it, unwrap_it(d_values_it), unwrap_it(d_values_out_it), scan_op, init_value_t{}, num_items, eq_op_t{}); // Verify result REQUIRE(expected_result == out_values); @@ -234,7 +234,7 @@ C2H_TEST("Device scan works with all device interfaces", "[by_key][scan][device] out_values = in_values; auto values_in_out_it = thrust::raw_pointer_cast(out_values.data()); device_exclusive_scan_by_key( - d_keys_it, unwrap_it(values_in_out_it), unwrap_it(values_in_out_it), scan_op, init_t{}, num_items, eq_op_t{}); + d_keys_it, unwrap_it(values_in_out_it), unwrap_it(values_in_out_it), scan_op, init_value_t{}, num_items, eq_op_t{}); // Verify result REQUIRE(expected_result == out_values); @@ -358,9 +358,9 @@ C2H_TEST("Device scan works when memory for keys and results alias one another", // Run test auto d_values_out_it = d_keys_it; - using init_t = value_t; + using init_value_t = value_t; device_exclusive_scan_by_key( - d_keys_it, d_values_it, d_values_out_it, scan_op, init_t{}, num_items, cuda::std::equal_to<>{}); + d_keys_it, d_values_it, d_values_out_it, scan_op, init_value_t{}, num_items, cuda::std::equal_to<>{}); // Verify result REQUIRE(expected_result == segment_keys); diff --git a/cub/test/catch2_test_device_scan_by_key_iterators.cu b/cub/test/catch2_test_device_scan_by_key_iterators.cu index 5204284ed5f..adc848cef3c 100644 --- a/cub/test/catch2_test_device_scan_by_key_iterators.cu +++ b/cub/test/catch2_test_device_scan_by_key_iterators.cu @@ -173,8 +173,8 @@ C2H_TEST("Device scan works with fancy iterators", "[by_key][scan][device]", ful // Run test c2h::device_vector out_values(num_items); - using init_t = value_t; - device_exclusive_scan_by_key(d_keys_it, values_in_it, out_values.begin(), scan_op, init_t{}, num_items, eq_op_t{}); + using init_value_t = value_t; + device_exclusive_scan_by_key(d_keys_it, values_in_it, out_values.begin(), scan_op, init_value_t{}, num_items, eq_op_t{}); // Verify result REQUIRE(expected_result == out_values); diff --git a/cub/test/catch2_test_device_scan_env.cu b/cub/test/catch2_test_device_scan_env.cu index 0531f32e278..394af4a48c0 100644 --- a/cub/test/catch2_test_device_scan_env.cu +++ b/cub/test/catch2_test_device_scan_env.cu @@ -353,9 +353,9 @@ C2H_TEST("Device scan exclusive-scan uses environment", "[scan][device]") auto d_in = cuda::constant_iterator(1.0f); auto d_out = c2h::device_vector(num_items); - using init_t = float; + using init_value_t = float; - init_t init{42.0f}; + init_value_t init{42.0f}; size_t expected_bytes_allocated{}; REQUIRE(cudaSuccess diff --git a/cub/test/catch2_test_device_scan_invalid.cu b/cub/test/catch2_test_device_scan_invalid.cu index 5cb8fd2dfeb..2b75eddc655 100644 --- a/cub/test/catch2_test_device_scan_invalid.cu +++ b/cub/test/catch2_test_device_scan_invalid.cu @@ -260,9 +260,9 @@ C2H_TEST("Device scan avoids invalid data with all device interfaces", "[scan][d // Run test c2h::device_vector out_result(num_items); const auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_t = output_t; - c2h::device_vector d_initial_value{init_value}; - const auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); + using init_value_t = output_t; + c2h::device_vector d_initial_value{init_value}; + const auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); device_exclusive_scan(d_in_it, d_out_it, scan_op, future_init_value, num_items); const counts h_counts = error_counts.front(); diff --git a/cub/test/catch2_test_device_scan_iterators.cu b/cub/test/catch2_test_device_scan_iterators.cu index 1a69cba5212..a4b02560cab 100644 --- a/cub/test/catch2_test_device_scan_iterators.cu +++ b/cub/test/catch2_test_device_scan_iterators.cu @@ -143,10 +143,10 @@ C2H_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis // Run test c2h::device_vector out_result(num_items); auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_t = cub::detail::it_value_t; - c2h::device_vector d_initial_value(1); - d_initial_value[0] = static_cast(init_value); - auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); + using init_value_t = cub::detail::it_value_t; + c2h::device_vector d_initial_value(1); + d_initial_value[0] = static_cast(init_value); + auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); device_exclusive_scan(in_it, d_out_it, op_t{}, future_init_value, num_items); // Verify result diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu index 33aa1d9a54b..7650056a139 100644 --- a/cub/test/catch2_test_device_segmented_reduce.cu +++ b/cub/test/catch2_test_device_segmented_reduce.cu @@ -115,9 +115,9 @@ C2H_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ // Run test c2h::device_vector out_result(num_segments); auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_t = cub::detail::it_value_t; + using init_value_t = cub::detail::it_value_t; device_segmented_reduce( - unwrap_it(d_in_it), unwrap_it(d_out_it), num_segments, d_offsets_it, d_offsets_it + 1, reduction_op, init_t{}); + unwrap_it(d_in_it), unwrap_it(d_out_it), num_segments, d_offsets_it, d_offsets_it + 1, reduction_op, init_value_t{}); // Verify result REQUIRE(expected_result == out_result); @@ -265,8 +265,8 @@ C2H_TEST("Device fixed size segmented reduce works with all device interfaces", c2h::device_vector out_result(num_segments); auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_t = cub::detail::it_value_t; - init_t init = static_cast(*unwrap_it(&default_constant)); + using init_value_t = cub::detail::it_value_t; + init_value_t init = static_cast(*unwrap_it(&default_constant)); device_segmented_reduce(unwrap_it(d_in_it), unwrap_it(d_out_it), num_segments, segment_size, reduction_op, init); // Verify result REQUIRE(expected_result == out_result); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu index f8d89293d5b..b177d04f7a2 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu @@ -60,20 +60,20 @@ C2H_TEST("Device segmented reduce works with fancy input iterators", "[reduce][d auto in_it = cuda::constant_iterator(default_constant); using op_t = cuda::std::plus<>; - using init_t = output_t; + using init_value_t = output_t; // Binary reduction operator auto reduction_op = op_t{}; // Prepare verification data - using accum_t = cuda::std::__accumulator_t; + using accum_t = cuda::std::__accumulator_t; c2h::host_vector expected_result(num_segments); compute_segmented_problem_reference(in_it, segment_offsets, reduction_op, accum_t{}, expected_result.begin()); // Run test c2h::device_vector out_result(num_segments); auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - device_segmented_reduce(in_it, d_out_it, num_segments, d_offsets_it, d_offsets_it + 1, reduction_op, init_t{}); + device_segmented_reduce(in_it, d_out_it, num_segments, d_offsets_it, d_offsets_it + 1, reduction_op, init_value_t{}); // Verify result REQUIRE(expected_result == out_result); diff --git a/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu b/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu index da1261dca2a..67565208f1f 100644 --- a/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu +++ b/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu @@ -29,7 +29,7 @@ C2H_TEST("Device segmented reduce works with dynamic max segment sizes", using op_t = cuda::std::plus<>; using accum_t = cuda::std::__accumulator_t; - using init_t = input_t; + using init_value_t = input_t; cuda::compute_capability cc{}; REQUIRE(cudaSuccess == cub::detail::ptx_compute_cap(cc)); @@ -97,7 +97,7 @@ C2H_TEST("Device segmented reduce works with dynamic max segment sizes", d_offsets_it, d_offsets_it + 1, op_t{}, - init_t{}, + init_value_t{}, guaranteed_max_seg_size, nullptr, cub::detail::segmented_reduce::policy_selector_from_types{}); @@ -114,7 +114,7 @@ C2H_TEST("Device segmented reduce works with dynamic max segment sizes", d_offsets_it, d_offsets_it + 1, op_t{}, - init_t{}, + init_value_t{}, guaranteed_max_seg_size, nullptr, cub::detail::segmented_reduce::policy_selector_from_types{}); diff --git a/cub/test/catch2_test_device_transform_reduce.cu b/cub/test/catch2_test_device_transform_reduce.cu index 7d9ba675b5b..1609853c88c 100644 --- a/cub/test/catch2_test_device_transform_reduce.cu +++ b/cub/test/catch2_test_device_transform_reduce.cu @@ -31,7 +31,7 @@ struct square_t C2H_TEST("Device transform reduce works with pointers", "[reduce][device]", types) { using item_t = c2h::get<0, TestType>; - using init_t = item_t; + using init_value_t = item_t; using offset_t = std::int32_t; using reduction_op_t = cuda::std::plus<>; using transform_op_t = square_t; @@ -78,7 +78,7 @@ C2H_TEST("Device transform reduce works with pointers", "[reduce][device]", type C2H_TEST("Device transform reduce works with iterators", "[reduce][device]", types) { using item_t = c2h::get<0, TestType>; - using init_t = item_t; + using init_value_t = item_t; using offset_t = std::int32_t; using reduction_op_t = cuda::std::plus<>; using transform_op_t = square_t; @@ -92,7 +92,7 @@ C2H_TEST("Device transform reduce works with iterators", "[reduce][device]", typ c2h::device_vector in(num_items, magic_val); c2h::device_vector out(1); - device_transform_reduce(in.begin(), out.begin(), num_items, reduction_op_t{}, transform_op_t{}, init_t{}); + device_transform_reduce(in.begin(), out.begin(), num_items, reduction_op_t{}, transform_op_t{}, init_value_t{}); const item_t expected = num_items * magic_val * magic_val; const item_t actual = out[0]; @@ -113,7 +113,7 @@ struct transformed_input_t std::uint64_t b; }; -struct init_t +struct init_value_t { char a; char b; @@ -134,7 +134,7 @@ struct accum_t , b{other.b} {} - __host__ __device__ accum_t(const init_t& other) + __host__ __device__ accum_t(const init_value_t& other) : a{static_cast(other.a)} , b{static_cast(other.b)} {} @@ -162,7 +162,7 @@ struct output_t , b{other.b} {} - __host__ __device__ output_t(const init_t& other) + __host__ __device__ output_t(const init_value_t& other) : a{static_cast(other.a)} , b{static_cast(other.b)} {} @@ -194,7 +194,7 @@ C2H_TEST("Device transform reduce doesn't let input type into reduction op", "[r const int num_items = GENERATE_COPY(take(3, random(min_items, max_items))); - const init_t init{3, 3}; + const init_value_t init{3, 3}; const input_t magic_val{2, 2}; c2h::device_vector in(num_items, magic_val); From d2378983d6bd97bd19ca4cefdd2af0b28918c884 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Thu, 4 Jun 2026 18:13:00 -0700 Subject: [PATCH 2/3] format --- cub/benchmarks/bench/reduce/base.cuh | 9 ++- .../bench/reduce/nondeterministic.cu | 11 ++- cub/benchmarks/bench/scan/exclusive/base.cuh | 2 +- .../bench/scan/exclusive/deterministic.cu | 6 +- .../bench/segmented_reduce/base.cuh | 6 +- cub/benchmarks/bench/transform_reduce/sum.cu | 2 +- cub/cub/device/device_reduce.cuh | 19 +++-- cub/cub/device/device_scan.cuh | 5 +- cub/cub/device/device_segmented_reduce.cuh | 77 +++++++++++-------- cub/cub/device/dispatch/dispatch_reduce.cuh | 71 +++++++++-------- .../dispatch_reduce_deterministic.cuh | 6 +- .../dispatch_reduce_nondeterministic.cuh | 41 +++++----- .../dispatch/dispatch_segmented_reduce.cuh | 52 +++++++------ .../device/dispatch/kernels/kernel_reduce.cuh | 7 +- cub/test/catch2_test_device_reduce.cu | 4 +- ...catch2_test_device_reduce_deterministic.cu | 14 ++-- .../catch2_test_device_reduce_dispatcher.cu | 10 +-- cub/test/catch2_test_device_reduce_env.cu | 8 +- .../catch2_test_device_reduce_iterators.cu | 4 +- ...ch2_test_device_reduce_nondeterministic.cu | 18 +++-- cub/test/catch2_test_device_scan.cu | 8 +- cub/test/catch2_test_device_scan_by_key.cu | 12 ++- ...atch2_test_device_scan_by_key_iterators.cu | 3 +- cub/test/catch2_test_device_scan_invalid.cu | 2 +- cub/test/catch2_test_device_scan_iterators.cu | 4 +- .../catch2_test_device_segmented_reduce.cu | 12 ++- ..._test_device_segmented_reduce_iterators.cu | 2 +- ...st_device_segmented_reduce_max_seg_size.cu | 6 +- .../catch2_test_device_transform_reduce.cu | 4 +- 29 files changed, 249 insertions(+), 176 deletions(-) diff --git a/cub/benchmarks/bench/reduce/base.cuh b/cub/benchmarks/bench/reduce/base.cuh index f6289215a53..fb2bc1c0e88 100644 --- a/cub/benchmarks/bench/reduce/base.cuh +++ b/cub/benchmarks/bench/reduce/base.cuh @@ -53,7 +53,14 @@ void reduce(nvbench::state& state, nvbench::type_list) #endif // !TUNE_BASE ); _CCCL_TRY_CUDA_API( - cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, static_cast(elements), op_t{}, init_value_t{}, env); + cub::DeviceReduce::Reduce, + "Reduce failed", + d_in, + d_out, + static_cast(elements), + op_t{}, + init_value_t{}, + env); }); } diff --git a/cub/benchmarks/bench/reduce/nondeterministic.cu b/cub/benchmarks/bench/reduce/nondeterministic.cu index dd2e044e458..220d252b930 100644 --- a/cub/benchmarks/bench/reduce/nondeterministic.cu +++ b/cub/benchmarks/bench/reduce/nondeterministic.cu @@ -38,7 +38,7 @@ struct policy_selector template void nondeterministic_sum(nvbench::state& state, nvbench::type_list) { - using op_t = cuda::std::plus<>; + using op_t = cuda::std::plus<>; using init_value_t = T; // Retrieve axis parameters @@ -67,7 +67,14 @@ void nondeterministic_sum(nvbench::state& state, nvbench::type_list) #endif // !TUNE_BASE ); _CCCL_TRY_CUDA_API( - cub::DeviceReduce::Reduce, "Reduce failed", d_in, d_out, static_cast(elements), op_t{}, init_value_t{}, env); + cub::DeviceReduce::Reduce, + "Reduce failed", + d_in, + d_out, + static_cast(elements), + op_t{}, + init_value_t{}, + env); }); } diff --git a/cub/benchmarks/bench/scan/exclusive/base.cuh b/cub/benchmarks/bench/scan/exclusive/base.cuh index 5c6931485e6..fcb98d80813 100644 --- a/cub/benchmarks/bench/scan/exclusive/base.cuh +++ b/cub/benchmarks/bench/scan/exclusive/base.cuh @@ -15,7 +15,7 @@ template static void basic(nvbench::state& state, nvbench::type_list) try { - using init_value_t = T; + using init_value_t = T; using accum_t [[maybe_unused]] = ::cuda::std::__accumulator_t; using offset_t = cub::detail::choose_offset_t; #if USES_WARPSPEED() diff --git a/cub/benchmarks/bench/scan/exclusive/deterministic.cu b/cub/benchmarks/bench/scan/exclusive/deterministic.cu index 95980eb9af0..9d10274c9a1 100644 --- a/cub/benchmarks/bench/scan/exclusive/deterministic.cu +++ b/cub/benchmarks/bench/scan/exclusive/deterministic.cu @@ -13,9 +13,9 @@ template static void exclusive_scan(nvbench::state& state, nvbench::type_list) try { - using init_value_t = T; - using offset_t = OffsetT; - using scan_op_t = ::cuda::std::plus; + using init_value_t = T; + using offset_t = OffsetT; + using scan_op_t = ::cuda::std::plus; const auto elements = static_cast(state.get_int64("Elements{io}")); diff --git a/cub/benchmarks/bench/segmented_reduce/base.cuh b/cub/benchmarks/bench/segmented_reduce/base.cuh index a7550fb3063..cacbcb49bf5 100644 --- a/cub/benchmarks/bench/segmented_reduce/base.cuh +++ b/cub/benchmarks/bench/segmented_reduce/base.cuh @@ -44,9 +44,9 @@ void fixed_size_segmented_reduce(nvbench::state& state, nvbench::type_list) { static constexpr bool is_argmin = std::is_same_v; - using output_t = cuda::std::conditional_t, T>; - using accum_t = output_t; - using init_value_t = cuda::std::conditional_t, T>; + using output_t = cuda::std::conditional_t, T>; + using accum_t = output_t; + using init_value_t = cuda::std::conditional_t, T>; // Retrieve axis parameters const size_t num_elements = static_cast(state.get_int64("Elements{io}")); diff --git a/cub/benchmarks/bench/transform_reduce/sum.cu b/cub/benchmarks/bench/transform_reduce/sum.cu index 56daef92200..b657dd83701 100644 --- a/cub/benchmarks/bench/transform_reduce/sum.cu +++ b/cub/benchmarks/bench/transform_reduce/sum.cu @@ -37,7 +37,7 @@ struct square_t template void reduce(nvbench::state& state, nvbench::type_list) { - using init_value_t = T; + using init_value_t = T; using reduction_op_t = ::cuda::std::plus<>; using transform_op_t = square_t; diff --git a/cub/cub/device/device_reduce.cuh b/cub/cub/device/device_reduce.cuh index 99a0a22a229..67ec6f37bdb 100644 --- a/cub/cub/device/device_reduce.cuh +++ b/cub/cub/device/device_reduce.cuh @@ -254,7 +254,12 @@ private: typename NumItemsT, typename EnvT> [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t __minmax_reduce( - InputIteratorT d_in, OutputIteratorT d_out, NumItemsT num_items, ReductionOpT reduction_op, InitValueT init, EnvT env) + InputIteratorT d_in, + OutputIteratorT d_out, + NumItemsT num_items, + ReductionOpT reduction_op, + InitValueT init, + EnvT env) { static_assert(!::cuda::std::execution::__queryable_with, "Determinism should be used inside requires to have an effect."); @@ -765,10 +770,10 @@ public: { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Min"); - using OffsetT = detail::choose_offset_t; // Signed integer type for global offsets - using InputT = detail::it_value_t; + using OffsetT = detail::choose_offset_t; // Signed integer type for global offsets + using InputT = detail::it_value_t; using init_value_t = InputT; - using limits_t = ::cuda::std::numeric_limits; + using limits_t = ::cuda::std::numeric_limits; #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX static_assert(limits_t::is_specialized, "cub::DeviceReduce::Min uses cuda::std::numeric_limits::max() as initial " @@ -1398,10 +1403,10 @@ public: _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceReduce::Max"); // Signed integer type for global offsets - using OffsetT = detail::choose_offset_t; - using InputT = detail::it_value_t; + using OffsetT = detail::choose_offset_t; + using InputT = detail::it_value_t; using init_value_t = InputT; - using limits_t = ::cuda::std::numeric_limits; + using limits_t = ::cuda::std::numeric_limits; #ifndef CCCL_SUPPRESS_NUMERIC_LIMITS_CHECK_IN_CUB_DEVICE_REDUCE_MIN_MAX static_assert(limits_t::is_specialized, "cub::DeviceReduce::Max uses cuda::std::numeric_limits::lowest() as " diff --git a/cub/cub/device/device_scan.cuh b/cub/cub/device/device_scan.cuh index 74fec0a43bd..b8f30043dbc 100644 --- a/cub/cub/device/device_scan.cuh +++ b/cub/cub/device/device_scan.cuh @@ -329,7 +329,7 @@ struct DeviceScan _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceScan::ExclusiveSum"); // Unsigned integer type for global offsets - using OffsetT = detail::choose_offset_t; + using OffsetT = detail::choose_offset_t; using init_value_t = cub::detail::it_value_t; // Initial value @@ -415,7 +415,8 @@ struct DeviceScan using init_value_t = cub::detail::it_value_t; init_value_t init_value{}; - return scan_impl_env(d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue(init_value), num_items, env); + return scan_impl_env( + d_in, d_out, ::cuda::std::plus<>{}, detail::InputValue(init_value), num_items, env); } //! @rst diff --git a/cub/cub/device/device_segmented_reduce.cuh b/cub/cub/device/device_segmented_reduce.cuh index c7b1dc97b29..d00f36c67c1 100644 --- a/cub/cub/device/device_segmented_reduce.cuh +++ b/cub/cub/device/device_segmented_reduce.cuh @@ -93,7 +93,7 @@ private: using input_value_t = cub::detail::it_value_t; using output_tuple_t = cub::detail::non_void_value_t>; using accum_t = output_tuple_t; - using init_value_t = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; using output_key_t = typename output_tuple_t::first_type; using output_value_t = typename output_tuple_t::second_type; @@ -830,9 +830,9 @@ public: { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum"); - using OffsetT = detail::common_iterator_value_t; - using OutputT = detail::non_void_value_t>; - using init_value_t = OutputT; + using OffsetT = detail::common_iterator_value_t; + using OutputT = detail::non_void_value_t>; + using init_value_t = OutputT; static_assert(::cuda::std::is_integral_v, "Offset iterator value type should be integral."); if constexpr (::cuda::std::is_integral_v) { @@ -942,11 +942,11 @@ public: { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceSegmentedReduce::Sum"); - using OffsetT = detail::common_iterator_value_t; - using OutputT = detail::non_void_value_t>; - using init_value_t = OutputT; - using op_t = ::cuda::std::plus<>; - using AccumT = ::cuda::std::__accumulator_t, init_value_t>; + using OffsetT = detail::common_iterator_value_t; + using OutputT = detail::non_void_value_t>; + using init_value_t = OutputT; + using op_t = ::cuda::std::plus<>; + using AccumT = ::cuda::std::__accumulator_t, init_value_t>; return variable_size_env_impl( d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, op_t{}, init_value_t{}, env); @@ -1018,7 +1018,15 @@ public: _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Sum"); using init_value_t = detail::non_void_value_t>; return fixed_size_impl( - d_temp_storage, temp_storage_bytes, d_in, d_out, num_segments, segment_size, ::cuda::std::plus{}, init_value_t{}, stream); + d_temp_storage, + temp_storage_bytes, + d_in, + d_out, + num_segments, + segment_size, + ::cuda::std::plus{}, + init_value_t{}, + stream); } //! @rst @@ -1192,9 +1200,9 @@ public: { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Min"); - using OffsetT = detail::common_iterator_value_t; - using InputT = detail::it_value_t; - using init_value_t = InputT; + using OffsetT = detail::common_iterator_value_t; + using InputT = detail::it_value_t; + using init_value_t = InputT; static_assert(::cuda::std::numeric_limits::is_specialized, "numeric_limits must be specialized for the input value type"); static_assert(::cuda::std::is_integral_v, "Offset iterator value type should be integral."); @@ -1306,17 +1314,24 @@ public: { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceSegmentedReduce::Min"); - using OffsetT = detail::common_iterator_value_t; - using InputT = detail::it_value_t; - using init_value_t = InputT; - using op_t = ::cuda::minimum<>; - using AccumT = ::cuda::std::__accumulator_t, init_value_t>; + using OffsetT = detail::common_iterator_value_t; + using InputT = detail::it_value_t; + using init_value_t = InputT; + using op_t = ::cuda::minimum<>; + using AccumT = ::cuda::std::__accumulator_t, init_value_t>; static_assert(::cuda::std::numeric_limits::is_specialized, "numeric_limits must be specialized for the input value type"); return variable_size_env_impl( - d_in, d_out, num_segments, d_begin_offsets, d_end_offsets, op_t{}, ::cuda::std::numeric_limits::max(), env); + d_in, + d_out, + num_segments, + d_begin_offsets, + d_end_offsets, + op_t{}, + ::cuda::std::numeric_limits::max(), + env); } //! @rst @@ -1594,7 +1609,7 @@ public: using OutputKeyT = typename OutputTupleT::Key; using OutputValueT = typename OutputTupleT::Value; using OverrideAccumT = OutputTupleT; - using init_value_t = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; static_assert(::cuda::std::is_same_v, "Output key type must be int."); static_assert(::cuda::std::numeric_limits::is_specialized, @@ -1732,7 +1747,7 @@ public: using OutputKeyT = typename OutputTupleT::Key; using OutputValueT = typename OutputTupleT::Value; using OverrideAccumT = OutputTupleT; - using init_value_t = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; static_assert(::cuda::std::is_same_v, "Output key type must be int."); static_assert(::cuda::std::numeric_limits::is_specialized, @@ -1978,9 +1993,9 @@ public: { _CCCL_NVTX_RANGE_SCOPE_IF(d_temp_storage, "cub::DeviceSegmentedReduce::Max"); - using OffsetT = detail::common_iterator_value_t; - using InputT = cub::detail::it_value_t; - using init_value_t = InputT; + using OffsetT = detail::common_iterator_value_t; + using InputT = cub::detail::it_value_t; + using init_value_t = InputT; static_assert(::cuda::std::numeric_limits::is_specialized, "numeric_limits must be specialized for the input value type"); @@ -2093,11 +2108,11 @@ public: { _CCCL_NVTX_RANGE_SCOPE("cub::DeviceSegmentedReduce::Max"); - using OffsetT = detail::common_iterator_value_t; - using InputT = cub::detail::it_value_t; - using init_value_t = InputT; - using op_t = ::cuda::maximum<>; - using AccumT = ::cuda::std::__accumulator_t, init_value_t>; + using OffsetT = detail::common_iterator_value_t; + using InputT = cub::detail::it_value_t; + using init_value_t = InputT; + using op_t = ::cuda::maximum<>; + using AccumT = ::cuda::std::__accumulator_t, init_value_t>; static_assert(::cuda::std::numeric_limits::is_specialized, "numeric_limits must be specialized for the input value type"); @@ -2387,7 +2402,7 @@ public: using InputValueT = cub::detail::it_value_t; using OutputTupleT = cub::detail::non_void_value_t>; using OverrideAccumT = OutputTupleT; - using init_value_t = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; using OutputKeyT = typename OutputTupleT::Key; using OutputValueT = typename OutputTupleT::Value; @@ -2525,7 +2540,7 @@ public: using InputValueT = cub::detail::it_value_t; using OutputTupleT = cub::detail::non_void_value_t>; using OverrideAccumT = OutputTupleT; - using init_value_t = detail::reduce::empty_problem_init_t; + using init_value_t = detail::reduce::empty_problem_init_t; using OutputKeyT = typename OutputTupleT::Key; using OutputValueT = typename OutputTupleT::Value; diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index fb685b2c355..63c8f93ec36 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -140,24 +140,25 @@ struct policy_selector_from_hub * @tparam InitValueT * Initial value type */ -template >, - typename AccumT = ::cuda::std::__accumulator_t, InitValueT>, - typename TransformOpT = ::cuda::std::identity, - typename PolicyHub = detail::reduce::policy_hub, - typename KernelSource = detail::reduce::DeviceReduceKernelSource< - detail::reduce::policy_selector_from_hub, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT, - InitValueT, - AccumT, - TransformOpT>, - typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +template < + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetT, + typename ReductionOpT, + typename InitValueT = cub::detail::non_void_value_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitValueT>, + typename TransformOpT = ::cuda::std::identity, + typename PolicyHub = detail::reduce::policy_hub, + typename KernelSource = detail::reduce::DeviceReduceKernelSource< + detail::reduce::policy_selector_from_hub, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT, + InitValueT, + AccumT, + TransformOpT>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> struct DispatchReduce { //--------------------------------------------------------------------------- @@ -719,20 +720,26 @@ template , int> = 0> _CCCL_HOST_DEVICE_API auto select_accum_t(OverrideAccumT*) -> OverrideAccumT; -template < - typename OverrideAccumT = use_default, - typename InputIteratorT, - typename OutputIteratorT, - typename OffsetT, - typename ReductionOpT, - typename InitValueT = non_void_value_t>, - typename TransformOpT = ::cuda::std::identity, - typename AccumT = - decltype(select_accum_t(static_cast(nullptr))), - typename PolicySelector = policy_selector_from_types, - typename KernelSource = - DeviceReduceKernelSource, - typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +template >, + typename TransformOpT = ::cuda::std::identity, + typename AccumT = decltype(select_accum_t( + static_cast(nullptr))), + typename PolicySelector = policy_selector_from_types, + typename KernelSource = DeviceReduceKernelSource< + PolicySelector, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT, + InitValueT, + AccumT, + TransformOpT>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> #if _CCCL_HAS_CONCEPTS() requires reduce_policy_selector #endif // _CCCL_HAS_CONCEPTS() diff --git a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh index 3d43f5e7ae2..c891e5d3c7c 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_deterministic.cuh @@ -46,8 +46,8 @@ template using transformed_input_t = ::cuda::std::decay_t<::cuda::std::invoke_result_t>; template -using accum_t = - ::cuda::std::__accumulator_t<::cuda::std::plus<>, InitValueT, transformed_input_t>>; +using accum_t = ::cuda::std:: + __accumulator_t<::cuda::std::plus<>, InitValueT, transformed_input_t>>; template >* = nullptr> struct deterministic_sum_t @@ -333,7 +333,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( InputIteratorT d_in, OutputIteratorT d_out, OffsetT num_items, - InitValueT init = {}, + InitValueT init = {}, cudaStream_t stream = {}, TransformOpT transform_op = {}, PolicySelector policy_selector = {}, diff --git a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh index 8c890f95edb..d099af50308 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_nondeterministic.cuh @@ -136,26 +136,27 @@ _CCCL_HOST_DEVICE_API auto select_nondeterministic_accum_t(OverrideAccumT*) -> O //! //! @param[in] stream //! CUDA stream to launch kernels within. Default is stream0. -template >, - typename TransformOpT = ::cuda::std::identity, - typename AccumT = decltype(select_nondeterministic_accum_t( - static_cast(nullptr))), - typename PolicySelector = policy_selector_from_types, - typename KernelSource = DeviceReduceNondeterministicKernelSource< - PolicySelector, - InputIteratorT, - OutputIteratorT, - OffsetT, - ReductionOpT, - InitValueT, - AccumT, - TransformOpT>, - typename KernelLauncherFactory = TripleChevronFactory> +template < + typename OverrideAccumT = nondeterministic_no_override, + typename InputIteratorT, + typename OutputIteratorT, + typename OffsetT, + typename ReductionOpT, + typename InitValueT = non_void_value_t>, + typename TransformOpT = ::cuda::std::identity, + typename AccumT = decltype(select_nondeterministic_accum_t( + static_cast(nullptr))), + typename PolicySelector = policy_selector_from_types, + typename KernelSource = DeviceReduceNondeterministicKernelSource< + PolicySelector, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT, + InitValueT, + AccumT, + TransformOpT>, + typename KernelLauncherFactory = TripleChevronFactory> #if _CCCL_HAS_CONCEPTS() requires reduce_nondeterministic_policy_selector #endif // _CCCL_HAS_CONCEPTS() diff --git a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh index c4304d27d77..d81d540c442 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_reduce.cuh @@ -138,26 +138,27 @@ public: * @tparam InitValueT * value type */ -template >, - typename AccumT = ::cuda::std::__accumulator_t, InitValueT>, - typename PolicyHub = detail::segmented_reduce::policy_hub, - typename KernelSource = detail::segmented_reduce::DeviceSegmentedReduceKernelSource< - detail::segmented_reduce::policy_selector_from_hub, - InputIteratorT, - OutputIteratorT, - BeginOffsetIteratorT, - EndOffsetIteratorT, - OffsetT, - ReductionOpT, - InitValueT, - AccumT>, - typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> +template < + typename InputIteratorT, + typename OutputIteratorT, + typename BeginOffsetIteratorT, + typename EndOffsetIteratorT, + typename OffsetT, + typename ReductionOpT, + typename InitValueT = cub::detail::non_void_value_t>, + typename AccumT = ::cuda::std::__accumulator_t, InitValueT>, + typename PolicyHub = detail::segmented_reduce::policy_hub, + typename KernelSource = detail::segmented_reduce::DeviceSegmentedReduceKernelSource< + detail::segmented_reduce::policy_selector_from_hub, + InputIteratorT, + OutputIteratorT, + BeginOffsetIteratorT, + EndOffsetIteratorT, + OffsetT, + ReductionOpT, + InitValueT, + AccumT>, + typename KernelLauncherFactory = CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY> struct DispatchSegmentedReduce { //--------------------------------------------------------------------------- @@ -677,7 +678,14 @@ struct DeviceFixedSizeSegmentedReduceKernelSource CUB_DEFINE_KERNEL_GETTER( FixedSizeSegmentedReduceKernel, - DeviceFixedSizeSegmentedReduceKernel) + DeviceFixedSizeSegmentedReduceKernel< + PolicySelector, + InputIteratorT, + OutputIteratorT, + OffsetT, + ReductionOpT, + InitValueT, + AccumT>) CUB_DEFINE_KERNEL_GETTER( FixedSizeSegmentedReduceKernelFinal, @@ -694,7 +702,7 @@ template >, + typename InitValueT = non_void_value_t>, typename AccumT = decltype(select_segmented_accum_t( static_cast(nullptr))), typename PolicySelector = policy_selector_from_types, diff --git a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh index 2a7f561251d..c58d050b9fd 100644 --- a/cub/cub/device/dispatch/kernels/kernel_reduce.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_reduce.cuh @@ -50,7 +50,8 @@ _CCCL_HOST_DEVICE _CCCL_FORCEINLINE InitValueT unwrap_empty_problem_init(InitVal } template -_CCCL_HOST_DEVICE _CCCL_FORCEINLINE InitValueT unwrap_empty_problem_init(empty_problem_init_t empty_problem_init) +_CCCL_HOST_DEVICE _CCCL_FORCEINLINE InitValueT +unwrap_empty_problem_init(empty_problem_init_t empty_problem_init) { return empty_problem_init.init; } @@ -77,8 +78,8 @@ finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT reduction_op, I * @param block_aggregate Aggregate value computed by the block */ template -_CCCL_HOST_DEVICE void -finalize_and_store_aggregate(OutputIteratorT d_out, ReductionOpT, empty_problem_init_t, AccumT block_aggregate) +_CCCL_HOST_DEVICE void finalize_and_store_aggregate( + OutputIteratorT d_out, ReductionOpT, empty_problem_init_t, AccumT block_aggregate) { *d_out = block_aggregate; } diff --git a/cub/test/catch2_test_device_reduce.cu b/cub/test/catch2_test_device_reduce.cu index cde5f1990a4..85a5ca91615 100644 --- a/cub/test/catch2_test_device_reduce.cu +++ b/cub/test/catch2_test_device_reduce.cu @@ -146,8 +146,8 @@ C2H_TEST("Device reduce works with all device interfaces", "[reduce][device]", f // Run test c2h::device_vector out_result(num_segments); - auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_value_t = cub::detail::it_value_t; + auto d_out_it = thrust::raw_pointer_cast(out_result.data()); + using init_value_t = cub::detail::it_value_t; device_reduce(unwrap_it(d_in_it), unwrap_it(d_out_it), num_items, reduction_op, init_value_t{}); // Verify result diff --git a/cub/test/catch2_test_device_reduce_deterministic.cu b/cub/test/catch2_test_device_reduce_deterministic.cu index 7618a87a32c..2b0a4953b4f 100644 --- a/cub/test/catch2_test_device_reduce_deterministic.cu +++ b/cub/test/catch2_test_device_reduce_deterministic.cu @@ -306,7 +306,7 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff "[reduce][deterministic]", test_types) { - using type = typename c2h::get<0, TestType>; + using type = typename c2h::get<0, TestType>; using init_value_t = type; const auto env = cuda::execution::require(cuda::execution::determinism::gpu_to_gpu); @@ -326,8 +326,8 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff { c2h::device_vector d_output(1); - auto error = - cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::std::plus{}, init_value_t{}, env); + auto error = cub::DeviceReduce::Reduce( + d_input.begin(), d_output.begin(), num_items, cuda::std::plus{}, init_value_t{}, env); REQUIRE(error == cudaSuccess); c2h::host_vector h_input = d_input; @@ -385,8 +385,8 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff init_value_t init_value{cuda::std::numeric_limits::max()}; - auto error = - cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::minimum{}, init_value, env); + auto error = cub::DeviceReduce::Reduce( + d_input.begin(), d_output.begin(), num_items, cuda::minimum{}, init_value, env); REQUIRE(error == cudaSuccess); c2h::host_vector h_input = d_input; @@ -403,8 +403,8 @@ C2H_TEST("Deterministic Device reduce works with integral types on gpu with diff init_value_t init_value{cuda::std::numeric_limits::min()}; - auto error = - cub::DeviceReduce::Reduce(d_input.begin(), d_output.begin(), num_items, cuda::maximum{}, init_value, env); + auto error = cub::DeviceReduce::Reduce( + d_input.begin(), d_output.begin(), num_items, cuda::maximum{}, init_value, env); REQUIRE(error == cudaSuccess); c2h::host_vector h_input = d_input; diff --git a/cub/test/catch2_test_device_reduce_dispatcher.cu b/cub/test/catch2_test_device_reduce_dispatcher.cu index ec1270c708c..0a039f316f0 100644 --- a/cub/test/catch2_test_device_reduce_dispatcher.cu +++ b/cub/test/catch2_test_device_reduce_dispatcher.cu @@ -41,11 +41,11 @@ struct policy_hub_t C2H_TEST("Dispatch reduce can be called with custom policy_hub", "[reduce][device]", value_types) { - using T = c2h::get<0, TestType>; - using offset_t = int32_t; - using init_value_t = T; - using op_t = cuda::std::plus<>; - using accum_t = cuda::std::__accumulator_t; + using T = c2h::get<0, TestType>; + using offset_t = int32_t; + using init_value_t = T; + using op_t = cuda::std::plus<>; + using accum_t = cuda::std::__accumulator_t; const int num_items = 12'345; diff --git a/cub/test/catch2_test_device_reduce_env.cu b/cub/test/catch2_test_device_reduce_env.cu index 00bca4a3a1f..e142a56dfde 100644 --- a/cub/test/catch2_test_device_reduce_env.cu +++ b/cub/test/catch2_test_device_reduce_env.cu @@ -401,7 +401,7 @@ C2H_TEST("Device reduce uses environment", "[reduce][device]", requirements) using num_items_t = int; using offset_t = cub::detail::choose_offset_t; using transform_t = cuda::std::identity; - using init_value_t = accumulator_t; + using init_value_t = accumulator_t; num_items_t num_items = GENERATE(1 << 4, 1 << 24); auto d_in = cuda::constant_iterator(1.0f); @@ -529,7 +529,7 @@ C2H_TEST("Device sum uses environment", "[reduce][device]", requirements) using num_items_t = int; using offset_t = cub::detail::choose_offset_t; using transform_t = cuda::std::identity; - using init_value_t = accumulator_t; + using init_value_t = accumulator_t; num_items_t num_items = GENERATE(1 << 4, 1 << 24); auto d_in = cuda::constant_iterator(1.0f); @@ -653,7 +653,7 @@ C2H_TEST("Device reduce not_guaranteed falls back when output type differs from using output_t = cuda::std::uint8_t; using accumulator_t = int; using op_t = cuda::std::plus<>; - using init_value_t = input_t; + using init_value_t = input_t; using num_items_t = int; using offset_t = cub::detail::choose_offset_t; using transform_t = cuda::std::identity; @@ -708,7 +708,7 @@ C2H_TEST("Device sum not_guaranteed falls back when output type differs from acc using output_t = cuda::std::uint8_t; using accumulator_t = int; using op_t = cuda::std::plus<>; - using init_value_t = output_t; + using init_value_t = output_t; using num_items_t = int; using offset_t = cub::detail::choose_offset_t; using transform_t = cuda::std::identity; diff --git a/cub/test/catch2_test_device_reduce_iterators.cu b/cub/test/catch2_test_device_reduce_iterators.cu index ed494c52e8d..7a9282bd9d8 100644 --- a/cub/test/catch2_test_device_reduce_iterators.cu +++ b/cub/test/catch2_test_device_reduce_iterators.cu @@ -71,7 +71,7 @@ C2H_TEST("Device reduce works with fancy input iterators", "[reduce][device]", i init_default_constant(default_constant); auto in_it = cuda::constant_iterator(default_constant); - using op_t = cuda::std::plus<>; + using op_t = cuda::std::plus<>; using init_value_t = output_t; // Binary reduction operator @@ -110,7 +110,7 @@ C2H_TEST("Device reduce compiles with discard output iterator", "[reduce][device init_default_constant(default_constant); auto in_it = cuda::constant_iterator(default_constant); - using op_t = cuda::std::plus<>; + using op_t = cuda::std::plus<>; using init_value_t = output_t; // Binary reduction operator diff --git a/cub/test/catch2_test_device_reduce_nondeterministic.cu b/cub/test/catch2_test_device_reduce_nondeterministic.cu index a56dcee1cd7..76948daa3d4 100644 --- a/cub/test/catch2_test_device_reduce_nondeterministic.cu +++ b/cub/test/catch2_test_device_reduce_nondeterministic.cu @@ -217,15 +217,23 @@ C2H_TEST("Nondeterministic Device reduce works with float and double on gpu with auto* raw_ptr = thrust::raw_pointer_cast(d_output.data()); - using output_it_t = decltype(raw_ptr); - using init_value_t = type; - using accum_t = type; - using transform_t = square_t; + using output_it_t = decltype(raw_ptr); + using init_value_t = type; + using accum_t = type; + using transform_t = square_t; std::size_t temp_storage_bytes{}; auto error = cub::detail::reduce_nondeterministic::dispatch( - nullptr, temp_storage_bytes, input, raw_ptr, num_items, cuda::std::plus{}, init_value_t{}, nullptr, transform_t{}); + nullptr, + temp_storage_bytes, + input, + raw_ptr, + num_items, + cuda::std::plus{}, + init_value_t{}, + nullptr, + transform_t{}); REQUIRE(error == cudaSuccess); c2h::device_vector temp_storage(temp_storage_bytes, thrust::no_init); diff --git a/cub/test/catch2_test_device_scan.cu b/cub/test/catch2_test_device_scan.cu index 795bd0ca6e8..fcac927cebc 100644 --- a/cub/test/catch2_test_device_scan.cu +++ b/cub/test/catch2_test_device_scan.cu @@ -261,8 +261,8 @@ C2H_TEST("Device scan works with all device interfaces", "[scan][device]", full_ // Run test c2h::device_vector out_result(num_items); - auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_value_t = cub::detail::it_value_t; + auto d_out_it = thrust::raw_pointer_cast(out_result.data()); + using init_value_t = cub::detail::it_value_t; device_exclusive_scan(unwrap_it(d_in_it), unwrap_it(d_out_it), scan_op, init_value_t{}, num_items); // Verify result @@ -297,8 +297,8 @@ C2H_TEST("Device scan works with all device interfaces", "[scan][device]", full_ // Run test c2h::device_vector out_result(num_items); - auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_value_t = cub::detail::it_value_t; + auto d_out_it = thrust::raw_pointer_cast(out_result.data()); + using init_value_t = cub::detail::it_value_t; c2h::device_vector d_initial_value(1); d_initial_value[0] = static_cast(*unwrap_it(&init_value)); auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); diff --git a/cub/test/catch2_test_device_scan_by_key.cu b/cub/test/catch2_test_device_scan_by_key.cu index bbc25d518f2..1d3dfdf9b7a 100644 --- a/cub/test/catch2_test_device_scan_by_key.cu +++ b/cub/test/catch2_test_device_scan_by_key.cu @@ -219,7 +219,7 @@ C2H_TEST("Device scan works with all device interfaces", "[by_key][scan][device] // Run test c2h::device_vector out_values(num_items); auto d_values_out_it = thrust::raw_pointer_cast(out_values.data()); - using init_value_t = cub::detail::it_value_t; + using init_value_t = cub::detail::it_value_t; device_exclusive_scan_by_key( d_keys_it, unwrap_it(d_values_it), unwrap_it(d_values_out_it), scan_op, init_value_t{}, num_items, eq_op_t{}); @@ -234,7 +234,13 @@ C2H_TEST("Device scan works with all device interfaces", "[by_key][scan][device] out_values = in_values; auto values_in_out_it = thrust::raw_pointer_cast(out_values.data()); device_exclusive_scan_by_key( - d_keys_it, unwrap_it(values_in_out_it), unwrap_it(values_in_out_it), scan_op, init_value_t{}, num_items, eq_op_t{}); + d_keys_it, + unwrap_it(values_in_out_it), + unwrap_it(values_in_out_it), + scan_op, + init_value_t{}, + num_items, + eq_op_t{}); // Verify result REQUIRE(expected_result == out_values); @@ -358,7 +364,7 @@ C2H_TEST("Device scan works when memory for keys and results alias one another", // Run test auto d_values_out_it = d_keys_it; - using init_value_t = value_t; + using init_value_t = value_t; device_exclusive_scan_by_key( d_keys_it, d_values_it, d_values_out_it, scan_op, init_value_t{}, num_items, cuda::std::equal_to<>{}); diff --git a/cub/test/catch2_test_device_scan_by_key_iterators.cu b/cub/test/catch2_test_device_scan_by_key_iterators.cu index adc848cef3c..48253b97d1a 100644 --- a/cub/test/catch2_test_device_scan_by_key_iterators.cu +++ b/cub/test/catch2_test_device_scan_by_key_iterators.cu @@ -174,7 +174,8 @@ C2H_TEST("Device scan works with fancy iterators", "[by_key][scan][device]", ful // Run test c2h::device_vector out_values(num_items); using init_value_t = value_t; - device_exclusive_scan_by_key(d_keys_it, values_in_it, out_values.begin(), scan_op, init_value_t{}, num_items, eq_op_t{}); + device_exclusive_scan_by_key( + d_keys_it, values_in_it, out_values.begin(), scan_op, init_value_t{}, num_items, eq_op_t{}); // Verify result REQUIRE(expected_result == out_values); diff --git a/cub/test/catch2_test_device_scan_invalid.cu b/cub/test/catch2_test_device_scan_invalid.cu index 2b75eddc655..ba8f9ef9c43 100644 --- a/cub/test/catch2_test_device_scan_invalid.cu +++ b/cub/test/catch2_test_device_scan_invalid.cu @@ -260,7 +260,7 @@ C2H_TEST("Device scan avoids invalid data with all device interfaces", "[scan][d // Run test c2h::device_vector out_result(num_items); const auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_value_t = output_t; + using init_value_t = output_t; c2h::device_vector d_initial_value{init_value}; const auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); device_exclusive_scan(d_in_it, d_out_it, scan_op, future_init_value, num_items); diff --git a/cub/test/catch2_test_device_scan_iterators.cu b/cub/test/catch2_test_device_scan_iterators.cu index a4b02560cab..a48bd4466bb 100644 --- a/cub/test/catch2_test_device_scan_iterators.cu +++ b/cub/test/catch2_test_device_scan_iterators.cu @@ -142,8 +142,8 @@ C2H_TEST("Device scan works with iterators", "[scan][device]", iterator_type_lis // Run test c2h::device_vector out_result(num_items); - auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_value_t = cub::detail::it_value_t; + auto d_out_it = thrust::raw_pointer_cast(out_result.data()); + using init_value_t = cub::detail::it_value_t; c2h::device_vector d_initial_value(1); d_initial_value[0] = static_cast(init_value); auto future_init_value = cub::FutureValue(thrust::raw_pointer_cast(d_initial_value.data())); diff --git a/cub/test/catch2_test_device_segmented_reduce.cu b/cub/test/catch2_test_device_segmented_reduce.cu index 7650056a139..91fc5910185 100644 --- a/cub/test/catch2_test_device_segmented_reduce.cu +++ b/cub/test/catch2_test_device_segmented_reduce.cu @@ -114,10 +114,16 @@ C2H_TEST("Device reduce works with all device interfaces", "[segmented][reduce][ // Run test c2h::device_vector out_result(num_segments); - auto d_out_it = thrust::raw_pointer_cast(out_result.data()); - using init_value_t = cub::detail::it_value_t; + auto d_out_it = thrust::raw_pointer_cast(out_result.data()); + using init_value_t = cub::detail::it_value_t; device_segmented_reduce( - unwrap_it(d_in_it), unwrap_it(d_out_it), num_segments, d_offsets_it, d_offsets_it + 1, reduction_op, init_value_t{}); + unwrap_it(d_in_it), + unwrap_it(d_out_it), + num_segments, + d_offsets_it, + d_offsets_it + 1, + reduction_op, + init_value_t{}); // Verify result REQUIRE(expected_result == out_result); diff --git a/cub/test/catch2_test_device_segmented_reduce_iterators.cu b/cub/test/catch2_test_device_segmented_reduce_iterators.cu index b177d04f7a2..dc867bd6ce6 100644 --- a/cub/test/catch2_test_device_segmented_reduce_iterators.cu +++ b/cub/test/catch2_test_device_segmented_reduce_iterators.cu @@ -59,7 +59,7 @@ C2H_TEST("Device segmented reduce works with fancy input iterators", "[reduce][d init_default_constant(default_constant); auto in_it = cuda::constant_iterator(default_constant); - using op_t = cuda::std::plus<>; + using op_t = cuda::std::plus<>; using init_value_t = output_t; // Binary reduction operator diff --git a/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu b/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu index 67565208f1f..3832ddce132 100644 --- a/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu +++ b/cub/test/catch2_test_device_segmented_reduce_max_seg_size.cu @@ -27,9 +27,9 @@ C2H_TEST("Device segmented reduce works with dynamic max segment sizes", using output_t = input_t; using offset_t = typename c2h::get<1, TestType>; - using op_t = cuda::std::plus<>; - using accum_t = cuda::std::__accumulator_t; - using init_value_t = input_t; + using op_t = cuda::std::plus<>; + using accum_t = cuda::std::__accumulator_t; + using init_value_t = input_t; cuda::compute_capability cc{}; REQUIRE(cudaSuccess == cub::detail::ptx_compute_cap(cc)); diff --git a/cub/test/catch2_test_device_transform_reduce.cu b/cub/test/catch2_test_device_transform_reduce.cu index 1609853c88c..a17c5577078 100644 --- a/cub/test/catch2_test_device_transform_reduce.cu +++ b/cub/test/catch2_test_device_transform_reduce.cu @@ -31,7 +31,7 @@ struct square_t C2H_TEST("Device transform reduce works with pointers", "[reduce][device]", types) { using item_t = c2h::get<0, TestType>; - using init_value_t = item_t; + using init_value_t = item_t; using offset_t = std::int32_t; using reduction_op_t = cuda::std::plus<>; using transform_op_t = square_t; @@ -78,7 +78,7 @@ C2H_TEST("Device transform reduce works with pointers", "[reduce][device]", type C2H_TEST("Device transform reduce works with iterators", "[reduce][device]", types) { using item_t = c2h::get<0, TestType>; - using init_value_t = item_t; + using init_value_t = item_t; using offset_t = std::int32_t; using reduction_op_t = cuda::std::plus<>; using transform_op_t = square_t; From ecf7187e6d8fca4380ee0afb763a5dfa397d11f7 Mon Sep 17 00:00:00 2001 From: gonidelis Date: Fri, 5 Jun 2026 11:07:35 -0700 Subject: [PATCH 3/3] Remove stale InitValueT from docs - not used --- cub/cub/device/dispatch/dispatch_streaming_reduce.cuh | 3 --- 1 file changed, 3 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh index 47d089b665f..176bf7fdfc0 100644 --- a/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_streaming_reduce.cuh @@ -152,9 +152,6 @@ struct unzip_and_write_arg_extremum_op // The streaming reduction requires two overloads, one used for selecting the extremum within one partition and one // for selecting the extremum across partitions. // -// @tparam InitValueT -// Initial value type -// // @tparam PolicySelector // Selects the tuning policy template