Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 7 additions & 26 deletions cub/benchmarks/bench/segmented_topk/fixed/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cub/detail/choose_offset.cuh>
#include <cub/device/dispatch/dispatch_batched_topk.cuh>

#include <cuda/__argument_>
#include <cuda/iterator>

#include <nvbench_helper.cuh>
Expand Down Expand Up @@ -44,33 +45,13 @@ void fixed_seg_size_topk_keys(
nvbench::state& state,
nvbench::type_list<KeyT, nvbench::enum_type<MaxSegmentSize>, nvbench::enum_type<MaxNumSelected>>)
{
// Range of guaranteed total number of items
constexpr auto min_num_total_items = 1;
constexpr auto max_num_total_items = ::cuda::std::numeric_limits<::cuda::std::int32_t>::max();

// Static segment size
using seg_size_t = cub::detail::batched_topk::segment_size_static<MaxSegmentSize>;

// Static k (number of selected output elements per segment)
using k_value_t = cub::detail::batched_topk::k_static<MaxNumSelected>;

// Static selection direction (max)
using select_direction_value_t = cub::detail::batched_topk::select_direction_static<cub::detail::topk::select::max>;

// Number of segments is a host-accessible value
using num_segments_uniform_t = cub::detail::batched_topk::num_segments_uniform<>;

// Total number of items guarantee type
using total_num_items_guarantee_t =
cub::detail::batched_topk::total_num_items_guarantee<min_num_total_items, max_num_total_items>;

// Retrieve axis parameters
const auto max_elements = static_cast<size_t>(state.get_int64("Elements{io}"));
const auto segment_size = static_cast<::cuda::std::ptrdiff_t>(MaxSegmentSize);
const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected);
const auto num_segments = ::cuda::std::max<std::size_t>(1, (max_elements / segment_size));
const auto elements = num_segments * segment_size;
const auto total_num_items = total_num_items_guarantee_t{static_cast<::cuda::std::int64_t>(elements)};
const auto total_num_items = ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(elements)};
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note to myself: I think this should become part of the guarantees API, as, in the device interface, there is no concrete argument that could be annotated.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

PR: #9278

const bit_entropy entropy = str_to_entropy(state.get_string("Entropy"));

// Skip workloads where k exceeds the segment size
Expand All @@ -87,9 +68,9 @@ void fixed_seg_size_topk_keys(
auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size);
auto d_keys_out = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements);

auto segment_sizes = seg_size_t{};
auto k = k_value_t{};
auto select_directions = select_direction_value_t{};
auto segment_sizes = ::cuda::__argument::__constant<MaxSegmentSize>{};
auto k = ::cuda::__argument::__constant<MaxNumSelected>{};
auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};

state.add_element_count(elements, "NumElements");
state.add_element_count(segment_size, "SegmentSize");
Expand Down Expand Up @@ -117,8 +98,8 @@ void fixed_seg_size_topk_keys(
static_cast<cub::NullType**>(nullptr),
segment_sizes,
k,
select_directions,
num_segments_uniform_t{static_cast<::cuda::std::int64_t>(num_segments)},
select_direction,
::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(num_segments)},
total_num_items,
env);
});
Expand Down
20 changes: 9 additions & 11 deletions cub/benchmarks/bench/segmented_topk/variable/keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <thrust/reduce.h>
#include <thrust/tabulate.h>

#include <cuda/__argument_>
#include <cuda/iterator>
#include <cuda/random>
#include <cuda/std/algorithm>
Expand Down Expand Up @@ -171,20 +172,17 @@ void variable_seg_size_topk_keys(nvbench::state& state,
static_cast<cuda::std::int64_t>(MaxSegmentSize));
const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
const auto output_elements = static_cast<std::size_t>(num_segments) * K;
const auto total_num_items =
cub::detail::batched_topk::total_num_items_guarantee<1, cuda::std::numeric_limits<cuda::std::int64_t>::max()>{
static_cast<cuda::std::int64_t>(input_elements)};
const auto total_num_items = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(input_elements)};

auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
auto out_keys_buffer = thrust::device_vector<KeyT>(output_elements, thrust::no_init);

cub::detail::batched_topk::segment_size_per_segment<const cuda::std::int64_t*, 1, MaxSegmentSize> segment_sizes_param{
thrust::raw_pointer_cast(d_segment_sizes.data())};
cub::detail::batched_topk::k_static<K> k_param{};
cub::detail::batched_topk::select_direction_static<cub::detail::topk::select::max> select_directions{};
cub::detail::batched_topk::num_segments_uniform<> num_segments_uniform_param{
static_cast<cuda::std::int64_t>(num_segments)};
auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{
thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()};
auto k_param = ::cuda::__argument::__constant<K>{};
auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
auto num_segments_param = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(num_segments)};

auto d_keys_in = cuda::make_strided_iterator(
cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),
Expand All @@ -210,8 +208,8 @@ void variable_seg_size_topk_keys(nvbench::state& state,
static_cast<cub::NullType**>(nullptr),
segment_sizes_param,
k_param,
select_directions,
num_segments_uniform_param,
select_direction,
num_segments_param,
total_num_items,
env);
});
Expand Down
19 changes: 10 additions & 9 deletions cub/cub/agent/agent_batched_topk.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
#include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
#include <cub/util_type.cuh>

#include <cuda/__argument_>
#include <cuda/__cmath/ceil_div.h>

CUB_NAMESPACE_BEGIN
Expand Down Expand Up @@ -72,8 +73,8 @@ struct agent_batched_topk_worker_per_segment
using key_t = it_value_t<key_it_t>;
using value_t = it_value_t<value_it_t>;

using segment_size_val_t = typename SegmentSizeParameterT::value_type;
using num_segments_val_t = typename NumSegmentsParameterT::value_type;
using segment_size_val_t = typename ::cuda::__argument::__traits<SegmentSizeParameterT>::element_type;
using num_segments_val_t = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
Comment on lines +76 to +77
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Note to myself: I think we want to narrower the index/offset/size types to the more narrow of the two: the static upper bound or the element type.

using counters_t = batched_topk_counters<num_segments_val_t>;

static constexpr auto policy = PolicyGetter{}();
Expand All @@ -94,7 +95,7 @@ struct agent_batched_topk_worker_per_segment
multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread;

// Check if there could be large segments present
static constexpr bool only_small_segments = params::static_max_value_v<SegmentSizeParameterT> <= tile_size;
static constexpr bool only_small_segments = ::cuda::__argument::__traits<SegmentSizeParameterT>::max <= tile_size;

// Check if we are dealing with keys-only or key-value pairs
static constexpr bool is_keys_only = ::cuda::std::is_same_v<value_t, cub::NullType>;
Expand Down Expand Up @@ -190,16 +191,16 @@ struct agent_batched_topk_worker_per_segment

// Boundary check
// TODO (elstehle): consider skipping boundary check if we can safely assume the right grid dimensions
if (segment_id >= num_segments.get_param(0))
if (segment_id >= params::get_param(num_segments, 0))
{
return;
}

constexpr bool is_full_tile = params::has_single_static_value_v<SegmentSizeParameterT>
&& params::static_min_value_v<SegmentSizeParameterT> == tile_size;
constexpr bool is_full_tile = ::cuda::__argument::__traits<SegmentSizeParameterT>::is_constant
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: For something like an __immediate where we have sharp bounds, i.e., lowest=max, we wouldn't hit the full_tile branch, right? Do you think this is something we should somehow cover or do you think we can expect users to be always be using __constant? I guess there could be a scenario where users themselves do have something like bounds template parameters and would not check for narrow bounds before instantiating something with __immediate?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we could make __immediate smarter and report is_constant when it has static bounds where lower == higher. I will take a look into it in a follow-up PR.

&& ::cuda::__argument::__traits<SegmentSizeParameterT>::lowest == tile_size;

// Resolve Segment Parameters
const auto segment_size = segment_sizes.get_param(segment_id);
const auto segment_size = params::get_param(segment_sizes, segment_id);
if (!only_small_segments && segment_size > tile_size)
{
// Enqueue large segment
Expand All @@ -215,8 +216,8 @@ struct agent_batched_topk_worker_per_segment
else
{
// Process small segment
const auto k = (::cuda::std::min) (k_param.get_param(segment_id),
static_cast<decltype(k_param.get_param(segment_id))>(segment_size));
const auto k = (::cuda::std::min) (params::get_param(k_param, segment_id),
static_cast<decltype(params::get_param(k_param, segment_id))>(segment_size));
const auto direction = select_directions.get_param(segment_id);

// Determine padding key based on direction
Expand Down
Loading
Loading