Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions cub/benchmarks/bench/segmented_topk/fixed/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_batched_topk.cuh>

#include <cuda/__argument_>
#include <cuda/argument>
#include <cuda/iterator>

#include <nvbench_helper.cuh>
Expand Down Expand Up @@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys(
const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected);
const auto num_segments = ::cuda::std::max<std::size_t>(1, (max_elements / segment_size));
const auto elements = num_segments * segment_size;
const auto total_num_items = ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(elements)};
const auto total_num_items = ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(elements)};
const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

// Skip workloads where k exceeds the segment size
Expand All @@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys(
auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size);
auto d_keys_out = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements);

auto segment_sizes = ::cuda::__argument::__constant<MaxSegmentSize>{};
auto k = ::cuda::__argument::__constant<MaxNumSelected>{};
auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
auto segment_sizes = ::cuda::args::constant<MaxSegmentSize>{};
auto k = ::cuda::args::constant<MaxNumSelected>{};
auto select_direction = ::cuda::args::constant<cub::detail::topk::select::max>{};

state.add_element_count(elements, "NumElements");
state.add_element_count(segment_size, "SegmentSize");
Expand Down Expand Up @@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys(
segment_sizes,
k,
select_direction,
::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(num_segments)},
::cuda::args::immediate{static_cast<::cuda::std::int64_t>(num_segments)},
total_num_items,
env);
});
Expand Down
14 changes: 7 additions & 7 deletions cub/benchmarks/bench/segmented_topk/variable/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include <thrust/reduce.h>
#include <thrust/tabulate.h>

#include <cuda/__argument_>
#include <cuda/argument>
#include <cuda/iterator>
#include <cuda/random>
#include <cuda/std/algorithm>
Expand Down Expand Up @@ -172,17 +172,17 @@ void variable_seg_size_topk_keys(nvbench::state& state,
static_cast<cuda::std::int64_t>(MaxSegmentSize));
const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
const auto output_elements = static_cast<std::size_t>(num_segments) * K;
const auto total_num_items = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(input_elements)};
const auto total_num_items = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(input_elements)};

auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
auto out_keys_buffer = thrust::device_vector<KeyT>(output_elements, thrust::no_init);

auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{
thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()};
auto k_param = ::cuda::__argument::__constant<K>{};
auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
auto num_segments_param = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(num_segments)};
auto segment_sizes_param = ::cuda::args::__immediate_sequence{
thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()};
auto k_param = ::cuda::args::constant<K>{};
auto select_direction = ::cuda::args::constant<cub::detail::topk::select::max>{};
auto num_segments_param = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(num_segments)};

auto d_keys_in = cuda::make_strided_iterator(
cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),
Expand Down
12 changes: 6 additions & 6 deletions cub/cub/agent/agent_batched_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,8 @@
#include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
#include <cub/util_type.cuh>

#include <cuda/__argument_>
#include <cuda/__cmath/ceil_div.h>
#include <cuda/argument>

CUB_NAMESPACE_BEGIN

Expand Down Expand Up @@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment
using key_t = it_value_t<key_it_t>;
using value_t = it_value_t<value_it_t>;

using segment_size_val_t = typename ::cuda::__argument::__traits<SegmentSizeParameterT>::element_type;
using num_segments_val_t = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
using segment_size_val_t = typename ::cuda::args::__traits<SegmentSizeParameterT>::element_type;
using num_segments_val_t = typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type;
using counters_t = batched_topk_counters<num_segments_val_t>;

static constexpr auto policy = PolicyGetter{}();
Expand All @@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment
multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread;

// Check if there could be large segments present
static constexpr bool only_small_segments = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
static constexpr bool only_small_segments = ::cuda::args::__traits<SegmentSizeParameterT>::highest <= tile_size;

// Check if we are dealing with keys-only or key-value pairs
static constexpr bool is_keys_only = ::cuda::std::is_same_v<value_t, cub::NullType>;
Expand Down Expand Up @@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment
return;
}

constexpr bool is_full_tile = ::cuda::__argument::__traits<SegmentSizeParameterT>::is_constant
&& ::cuda::__argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
constexpr bool is_full_tile = ::cuda::args::__traits<SegmentSizeParameterT>::is_constant
&& ::cuda::args::__traits<SegmentSizeParameterT>::lowest == tile_size;

// Resolve Segment Parameters
const auto segment_size = params::get_param(segment_sizes, segment_id);
Expand Down
36 changes: 18 additions & 18 deletions cub/cub/detail/segmented_params.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# pragma system_header
#endif // no system header

#include <cuda/__argument_>
#include <cuda/argument>
#include <cuda/std/__type_traits/integral_constant.h>
#include <cuda/std/__type_traits/remove_cvref.h>
#include <cuda/std/__utility/forward.h>
Expand All @@ -33,10 +33,10 @@ namespace detail::params
//! @param[in] __index Segment index to read for sequence arguments.
//! @return The single argument value, or the sequence element at the given index.
_CCCL_TEMPLATE(class _Tp, class _SegmentIndexT)
_CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
_CCCL_REQUIRES((!::cuda::args::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
{
if constexpr (::cuda::__argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
if constexpr (::cuda::args::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
{
return __arg;
}
Expand All @@ -46,46 +46,46 @@ _CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<
}
}

template <auto _Value, class _SegmentIndexT>
template <auto _Value, class _Tp, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::__argument::__constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
get_param(const ::cuda::args::constant<_Value, _Tp>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg);
return ::cuda::args::__unwrap(__arg);
}

template <auto _Value, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::__argument::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
get_param(const ::cuda::args::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg)[__index];
return ::cuda::args::__unwrap(__arg)[__index];
}

template <class _Arg, class _StaticBounds, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
const ::cuda::__argument::__immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::args::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg);
return ::cuda::args::__unwrap(__arg);
}

template <class _Arg, class _StaticBounds, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::__argument::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
get_param(const ::cuda::args::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg)[__index];
return ::cuda::args::__unwrap(__arg)[__index];
}

template <class _Arg, class _StaticBounds, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
const ::cuda::__argument::__deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::args::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg);
return ::cuda::args::__unwrap(__arg);
}

template <class _Arg, class _StaticBounds, class _SegmentIndexT>
[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
get_param(const ::cuda::__argument::__deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
get_param(const ::cuda::args::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
{
return ::cuda::__argument::__unwrap(__arg)[__index];
return ::cuda::args::__unwrap(__arg)[__index];
}

// =====================================================================
Expand Down
22 changes: 11 additions & 11 deletions cub/cub/device/dispatch/dispatch_batched_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -31,10 +31,10 @@

#include <thrust/system/cuda/detail/core/triple_chevron_launch.h>

#include <cuda/__argument_>
#include <cuda/__cmath/ceil_div.h>
#include <cuda/__iterator/counting_iterator.h>
#include <cuda/__iterator/transform_iterator.h>
#include <cuda/argument>
#include <cuda/std/__functional/operations.h>
#include <cuda/std/__type_traits/is_same.h>
#include <cuda/std/__type_traits/remove_cv.h>
Expand All @@ -49,9 +49,9 @@ namespace detail::batched_topk
// Internal: wrap user-facing select direction into discrete param for dispatch
// -----------------------------------------------------------------------------

// Uniform (compile-time): __constant<Dir> -> single-option uniform_discrete_param.
template <detail::topk::select Dir>
[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::__argument::__constant<Dir>)
// Uniform (compile-time): constant<Dir> -> single-option uniform_discrete_param.
template <detail::topk::select Dir, class _Tp>
[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::args::constant<Dir, _Tp>)
{
return params::uniform_discrete_param<detail::topk::select, Dir>{Dir};
}
Expand Down Expand Up @@ -126,7 +126,7 @@ template <typename KeyInputItItT,
typename PolicySelector = policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
it_value_t<it_value_t<ValueInputItItT>>,
::cuda::std::int64_t,
::cuda::__argument::__traits<KParameterT>::highest>>
::cuda::args::__traits<KParameterT>::highest>>
#if _CCCL_HAS_CONCEPTS()
requires batched_topk_policy_selector<PolicySelector>
#endif // _CCCL_HAS_CONCEPTS()
Expand All @@ -145,7 +145,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
cudaStream_t stream = nullptr,
[[maybe_unused]] PolicySelector policy_selector = {})
{
using large_segment_tile_offset_t = typename ::cuda::__argument::__traits<TotalNumItemsGuaranteeT>::element_type;
using large_segment_tile_offset_t = typename ::cuda::args::__traits<TotalNumItemsGuaranteeT>::element_type;

// Wrap the raw enum into the internal discrete param type
auto select_directions = wrap_select_direction(select_direction);
Expand All @@ -171,9 +171,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
static constexpr int worker_per_segment_tile_size =
worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread;
static constexpr bool any_small_segments =
::cuda::__argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
::cuda::args::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
static constexpr bool only_small_segments =
::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
::cuda::args::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;

// Allocation layout:
// only_small_segments: [0] dummy.
Expand All @@ -183,7 +183,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
static constexpr int allocations_array_size = only_small_segments ? 1 : (any_small_segments ? 3 : 2);
size_t allocation_sizes[allocations_array_size] = {1};

using num_segments_val_t = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
using num_segments_val_t = typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type;
using counters_t = batched_topk_counters<num_segments_val_t>;
using segment_size_scan_offset_t = detail::choose_offset_t<num_segments_val_t>;
using segment_size_scan_input_op_t =
Expand Down Expand Up @@ -239,7 +239,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(

// TODO (elstehle): support number of segments provided by device-accessible iterator
// Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host)
static_assert(::cuda::__argument::__traits<NumSegmentsParameterT>::is_single_value,
static_assert(::cuda::args::__traits<NumSegmentsParameterT>::is_single_value,
"Only uniform segment sizes are currently supported.");
Comment on lines +242 to 243
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor | ⚡ Quick win

suggestion: Fix the static_assert message to match the checked condition.
Line 243 says “Only uniform segment sizes are currently supported.” but Line 242 checks NumSegmentsParameterT (uniform number of segments). Update the message to avoid misleading compile-time diagnostics.


if constexpr (any_small_segments)
Expand Down Expand Up @@ -341,7 +341,7 @@ template <typename KeyInputItItT,
policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
it_value_t<it_value_t<ValueInputItItT>>,
::cuda::std::int64_t,
::cuda::__argument::__traits<KParameterT>::highest>;
::cuda::args::__traits<KParameterT>::highest>;
return detail::dispatch_with_env_and_tuning<default_policy_selector>(
env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
return dispatch(
Expand Down
10 changes: 5 additions & 5 deletions cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
#include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
#include <cub/util_arch.cuh>

#include <cuda/__argument_>
#include <cuda/__device/compute_capability.h>
#include <cuda/argument>

CUB_NAMESPACE_BEGIN

Expand All @@ -39,7 +39,7 @@ private:
worker_policy worker_per_segment_policy;
multi_worker_policy multi_worker_per_segment_policy;
};
static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest;
static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::args::__traits<SegmentSizeParameterT>::highest;
static constexpr batched_topk_policy active_policy = current_policy<PolicySelector>();

template <int Index>
Expand Down Expand Up @@ -133,8 +133,8 @@ __launch_bounds__(int(
KParameterT k,
SelectDirectionParameterT select_directions,
NumSegmentsParameterT num_segments,
batched_topk_counters<typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type>* d_counters,
typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
batched_topk_counters<typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type>* d_counters,
typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
LargeSegmentTileOffsetT* d_large_segments_tile_offsets)
{
using agent_t = typename find_smallest_covering_policy<
Expand All @@ -151,7 +151,7 @@ __launch_bounds__(int(
LargeSegmentTileOffsetT>::agent_t;

// Static Assertions (Constraints)
static_assert(agent_t::tile_size >= ::cuda::__argument::__traits<SegmentSizeParameterT>::highest,
static_assert(agent_t::tile_size >= ::cuda::args::__traits<SegmentSizeParameterT>::highest,
"Block size exceeds maximum segment size supported by SegmentSizeParameterT");
static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block,
"Static shared memory per block must not exceed 48KB limit.");
Expand Down
27 changes: 13 additions & 14 deletions cub/test/catch2_test_device_segmented_topk_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -151,11 +151,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
batched_topk_keys(
d_keys_in,
d_keys_out,
::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds<segment_size_t{1}, max_segment_size>()},
::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
::cuda::args::immediate{segment_size, ::cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
direction,
::cuda::__argument::__immediate{num_segments},
::cuda::__argument::__immediate{num_segments * segment_size});
::cuda::args::immediate{num_segments},
::cuda::args::immediate{num_segments * segment_size});
// Prepare expected results
fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction);
compact_sorted_keys_to_topk(expected_keys, segment_size, k);
Expand Down Expand Up @@ -248,12 +248,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment
batched_topk_keys(
d_keys_in,
d_keys_out,
::cuda::__argument::__immediate_sequence{
segment_size_it, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_segment_size>()},
::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
::cuda::args::__immediate_sequence{
segment_size_it, ::cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
direction,
::cuda::__argument::__immediate{num_segments},
::cuda::__argument::__immediate{num_items});
::cuda::args::immediate{num_segments},
::cuda::args::immediate{num_items});

// Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k
segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction);
Expand Down Expand Up @@ -286,12 +286,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment
batched_topk_keys(
d_keys_in_it,
d_keys_out_it,
::cuda::__argument::__immediate{
segment_size, ::cuda::__argument::__bounds<cuda::std::int64_t{1}, max_segment_size>()},
::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<cuda::std::int64_t{1}, k>()},
::cuda::args::immediate{segment_size, ::cuda::args::bounds<cuda::std::int64_t{1}, max_segment_size>()},
::cuda::args::immediate{k, ::cuda::args::bounds<cuda::std::int64_t{1}, k>()},
cub::detail::topk::select::min,
::cuda::__argument::__immediate{num_segments},
::cuda::__argument::__immediate{num_segments * segment_size});
::cuda::args::immediate{num_segments},
::cuda::args::immediate{num_segments * segment_size});

const int num_minus_zero = static_cast<int>(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{}));
REQUIRE(num_minus_zero >= 1);
Expand Down
Loading
Loading