diff --git a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu index b8f13469dce..77bd997b742 100644 --- a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include @@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys( const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected); const auto num_segments = ::cuda::std::max(1, (max_elements / segment_size)); const auto elements = num_segments * segment_size; - const auto total_num_items = ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(elements)}; + const auto total_num_items = ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(elements)}; const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); // Skip workloads where k exceeds the segment size @@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys( auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size); auto d_keys_out = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements); - auto segment_sizes = ::cuda::__argument::__constant{}; - auto k = ::cuda::__argument::__constant{}; - auto select_direction = ::cuda::__argument::__constant{}; + auto segment_sizes = ::cuda::args::constant{}; + auto k = ::cuda::args::constant{}; + auto select_direction = ::cuda::args::constant{}; state.add_element_count(elements, "NumElements"); state.add_element_count(segment_size, "SegmentSize"); @@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys( segment_sizes, k, select_direction, - ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(num_segments)}, + ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(num_segments)}, total_num_items, env); }); diff --git a/cub/benchmarks/bench/segmented_topk/variable/indexed.cu b/cub/benchmarks/bench/segmented_topk/variable/indexed.cu index f59b250e247..488d3aa4439 100644 --- a/cub/benchmarks/bench/segmented_topk/variable/indexed.cu +++ b/cub/benchmarks/bench/segmented_topk/variable/indexed.cu @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include @@ -36,18 +36,18 @@ void decode_style_variable_topk_indexed( static_cast(MaxSegmentSize)); const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end()); const auto output_elements = static_cast(num_segments) * K; - const auto total_num_items = ::cuda::__argument::__immediate{static_cast(input_elements)}; + const auto total_num_items = ::cuda::args::immediate{static_cast(input_elements)}; auto in_keys_buffer = gen_data( num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data())); auto out_keys_buffer = thrust::device_vector(output_elements, thrust::no_init); auto out_indices_buffer = thrust::device_vector(output_elements, thrust::no_init); - auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{ - thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()}; - auto k_param = ::cuda::__argument::__constant{}; - auto select_direction = ::cuda::__argument::__constant{}; - auto num_segments_param = ::cuda::__argument::__immediate{static_cast(num_segments)}; + auto segment_sizes_param = ::cuda::args::__immediate_sequence{ + thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()}; + auto k_param = ::cuda::args::constant{}; + auto select_direction = ::cuda::args::constant{}; + auto num_segments_param = ::cuda::args::immediate{static_cast(num_segments)}; auto d_keys_in = cuda::make_strided_iterator( cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())), diff --git a/cub/benchmarks/bench/segmented_topk/variable/keys.cu b/cub/benchmarks/bench/segmented_topk/variable/keys.cu index 001bbb4e258..5a54ad1fbaa 100644 --- a/cub/benchmarks/bench/segmented_topk/variable/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/variable/keys.cu @@ -7,7 +7,7 @@ #include #include -#include +#include #include #include @@ -32,17 +32,17 @@ void decode_style_variable_topk_keys( static_cast(MaxSegmentSize)); const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end()); const auto output_elements = static_cast(num_segments) * K; - const auto total_num_items = ::cuda::__argument::__immediate{static_cast(input_elements)}; + const auto total_num_items = ::cuda::args::immediate{static_cast(input_elements)}; auto in_keys_buffer = gen_data( num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data())); auto out_keys_buffer = thrust::device_vector(output_elements, thrust::no_init); - auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{ - thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()}; - auto k_param = ::cuda::__argument::__constant{}; - auto select_direction = ::cuda::__argument::__constant{}; - auto num_segments_param = ::cuda::__argument::__immediate{static_cast(num_segments)}; + auto segment_sizes_param = ::cuda::args::__immediate_sequence{ + thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()}; + auto k_param = ::cuda::args::constant{}; + auto select_direction = ::cuda::args::constant{}; + auto num_segments_param = ::cuda::args::immediate{static_cast(num_segments)}; auto d_keys_in = cuda::make_strided_iterator( cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())), diff --git a/cub/cub/agent/agent_batched_topk.cuh b/cub/cub/agent/agent_batched_topk.cuh index c5ec20d26cd..d2a99cfc809 100644 --- a/cub/cub/agent/agent_batched_topk.cuh +++ b/cub/cub/agent/agent_batched_topk.cuh @@ -23,8 +23,8 @@ #include #include -#include #include +#include CUB_NAMESPACE_BEGIN @@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment using key_t = it_value_t; using value_t = it_value_t; - using segment_size_val_t = typename ::cuda::__argument::__traits::element_type; - using num_segments_val_t = typename ::cuda::__argument::__traits::element_type; + using segment_size_val_t = typename ::cuda::args::__traits::element_type; + using num_segments_val_t = typename ::cuda::args::__traits::element_type; using counters_t = batched_topk_counters; static constexpr auto policy = PolicyGetter{}(); @@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread; // Check if there could be large segments present - static constexpr bool only_small_segments = ::cuda::__argument::__traits::highest <= tile_size; + static constexpr bool only_small_segments = ::cuda::args::__traits::highest <= tile_size; // Check if we are dealing with keys-only or key-value pairs static constexpr bool is_keys_only = ::cuda::std::is_same_v; @@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment return; } - constexpr bool is_full_tile = ::cuda::__argument::__traits::is_constant - && ::cuda::__argument::__traits::lowest == tile_size; + constexpr bool is_full_tile = ::cuda::args::__traits::is_constant + && ::cuda::args::__traits::lowest == tile_size; // Resolve Segment Parameters const auto segment_size = params::get_param(segment_sizes, segment_id); diff --git a/cub/cub/detail/segmented_params.cuh b/cub/cub/detail/segmented_params.cuh index 543f55b5036..41ba334eaef 100644 --- a/cub/cub/detail/segmented_params.cuh +++ b/cub/cub/detail/segmented_params.cuh @@ -13,7 +13,7 @@ # pragma system_header #endif // no system header -#include +#include #include #include #include @@ -33,10 +33,10 @@ namespace detail::params //! @param[in] __index Segment index to read for sequence arguments. //! @return The single argument value, or the sequence element at the given index. _CCCL_TEMPLATE(class _Tp, class _SegmentIndexT) -_CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) +_CCCL_REQUIRES((!::cuda::args::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - if constexpr (::cuda::__argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value) + if constexpr (::cuda::args::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value) { return __arg; } @@ -46,46 +46,46 @@ _CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t< } } -template +template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::constant<_Value, _Tp>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg); + return ::cuda::args::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg)[__index]; + return ::cuda::args::__unwrap(__arg)[__index]; } template -[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param( - const ::cuda::__argument::__immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto +get_param(const ::cuda::args::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg); + return ::cuda::args::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg)[__index]; + return ::cuda::args::__unwrap(__arg)[__index]; } template -[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param( - const ::cuda::__argument::__deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto +get_param(const ::cuda::args::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg); + return ::cuda::args::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg)[__index]; + return ::cuda::args::__unwrap(__arg)[__index]; } // ===================================================================== diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh index 56d12268dc9..d3c0c651991 100644 --- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh @@ -31,10 +31,10 @@ #include -#include #include #include #include +#include #include #include #include @@ -50,28 +50,28 @@ namespace detail::batched_topk // Internal: wrap the compile-time select direction into a discrete param for dispatch // ----------------------------------------------------------------------------- -// The selection direction is compile-time only: callers pass `::cuda::__argument::__constant`, which maps to a +// The selection direction is compile-time only: callers pass `::cuda::args::constant`, which maps to a // value-less static_discrete_param. Because the direction is fixed at compile time and carries no runtime value, it // can never disagree with its only supported option, so dispatch can never silently degrade to a no-op. -template -[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::__argument::__constant) +template +[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::args::constant) { return params::static_discrete_param{}; } -// The selection direction is intentionally a compile-time constant: only `::cuda::__argument::__constant` is +// The selection direction is intentionally a compile-time constant: only `::cuda::args::constant` is // accepted (the overload above maps it to a value-less static_discrete_param). This catch-all documents that // deliberate limitation and rejects anything else (e.g. a runtime `detail::topk::select` or a per-segment iterator of // directions) with a clear diagnostic. It is an intent/documentation guard rather than a user-facing one: callers // reach the algorithm through the min/max device entry points (DeviceBatchedTopK::{Max,Min}{Keys,Pairs}), which -// construct the matching `__constant` internally, so `dispatch` is only ever invoked with a direction we create. +// construct the matching `constant` internally, so `dispatch` is only ever invoked with a direction we create. template [[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(SelectDirectionT) { static_assert(::cuda::std::__always_false_v, "DeviceBatchedTopK currently supports only compile-time selection directions: the min/max entry " "points (DeviceBatchedTopK::{Max,Min}{Keys,Pairs}) dispatch with a " - "::cuda::__argument::__constant; runtime or per-segment directions are " + "::cuda::args::constant; runtime or per-segment directions are " "intentionally not supported"); // Unreachable (the static_assert above always fires); keeps the return type well-formed so the only diagnostic is // the message above. @@ -131,7 +131,7 @@ template >, it_value_t>, ::cuda::std::int64_t, - ::cuda::__argument::__traits::highest>> + ::cuda::args::__traits::highest>> #if _CCCL_HAS_CONCEPTS() requires batched_topk_policy_selector #endif // _CCCL_HAS_CONCEPTS() @@ -150,7 +150,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream = nullptr, [[maybe_unused]] PolicySelector policy_selector = {}) { - using large_segment_tile_offset_t = typename ::cuda::__argument::__traits::element_type; + using large_segment_tile_offset_t = typename ::cuda::args::__traits::element_type; // Wrap the raw enum into the internal discrete param type auto select_directions = wrap_select_direction(select_direction); @@ -176,9 +176,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( static constexpr int worker_per_segment_tile_size = worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread; static constexpr bool any_small_segments = - ::cuda::__argument::__traits::lowest <= worker_per_segment_tile_size; + ::cuda::args::__traits::lowest <= worker_per_segment_tile_size; static constexpr bool only_small_segments = - ::cuda::__argument::__traits::highest <= worker_per_segment_tile_size; + ::cuda::args::__traits::highest <= worker_per_segment_tile_size; // Allocation layout: // only_small_segments: [0] dummy. @@ -188,7 +188,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( static constexpr int allocations_array_size = only_small_segments ? 1 : (any_small_segments ? 3 : 2); size_t allocation_sizes[allocations_array_size] = {1}; - using num_segments_val_t = typename ::cuda::__argument::__traits::element_type; + using num_segments_val_t = typename ::cuda::args::__traits::element_type; using counters_t = batched_topk_counters; using segment_size_scan_offset_t = detail::choose_offset_t; using segment_size_scan_input_op_t = @@ -244,7 +244,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( // TODO (elstehle): support number of segments provided by device-accessible iterator // Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host) - static_assert(::cuda::__argument::__traits::is_single_value, + static_assert(::cuda::args::__traits::is_single_value, "Only uniform segment sizes are currently supported."); if constexpr (any_small_segments) @@ -346,7 +346,7 @@ template >, it_value_t>, ::cuda::std::int64_t, - ::cuda::__argument::__traits::highest>; + ::cuda::args::__traits::highest>; return detail::dispatch_with_env_and_tuning( env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) { return dispatch( diff --git a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh index 1ff50dfaf67..3412f40359a 100644 --- a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh @@ -20,8 +20,8 @@ #include #include -#include #include +#include CUB_NAMESPACE_BEGIN @@ -39,7 +39,7 @@ private: worker_policy worker_per_segment_policy; multi_worker_policy multi_worker_per_segment_policy; }; - static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::__argument::__traits::highest; + static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::args::__traits::highest; static constexpr batched_topk_policy active_policy = current_policy(); template @@ -133,8 +133,8 @@ __launch_bounds__(int( KParameterT k, SelectDirectionParameterT select_directions, NumSegmentsParameterT num_segments, - batched_topk_counters::element_type>* d_counters, - typename ::cuda::__argument::__traits::element_type* d_large_segments_ids, + batched_topk_counters::element_type>* d_counters, + typename ::cuda::args::__traits::element_type* d_large_segments_ids, LargeSegmentTileOffsetT* d_large_segments_tile_offsets) { using agent_t = typename find_smallest_covering_policy< @@ -151,7 +151,7 @@ __launch_bounds__(int( LargeSegmentTileOffsetT>::agent_t; // Static Assertions (Constraints) - static_assert(agent_t::tile_size >= ::cuda::__argument::__traits::highest, + static_assert(agent_t::tile_size >= ::cuda::args::__traits::highest, "Block size exceeds maximum segment size supported by SegmentSizeParameterT"); static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block, "Static shared memory per block must not exceed 48KB limit."); diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu index 3ef76bc2743..fe3e97188ac 100644 --- a/cub/test/catch2_test_device_segmented_topk_keys.cu +++ b/cub/test/catch2_test_device_segmented_topk_keys.cu @@ -27,11 +27,11 @@ struct is_minus_zero } }; -template CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys( @@ -41,7 +41,6 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys( KeyOutputItItT d_key_segments_out_it, SegmentSizeParamT segment_sizes, KParamT k, - SelectDirectionT select_direction, NumSegmentsParameterT num_segments, TotalNumItemsGuaranteeT total_num_items_guarantee, cudaStream_t stream = nullptr) @@ -56,14 +55,15 @@ CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_keys( values_it, segment_sizes, k, - select_direction, + ::cuda::args::constant{}, num_segments, total_num_items_guarantee, stream); } // %PARAM% TEST_LAUNCH lid 0:1:2 -DECLARE_LAUNCH_WRAPPER(dispatch_batched_topk_keys, batched_topk_keys); +DECLARE_TMPL_LAUNCH_WRAPPER( + dispatch_batched_topk_keys, batched_topk_keys, cub::detail::topk::select Direction, Direction); // Total segment size using max_segment_size_list = c2h::enum_type_list; @@ -153,14 +153,13 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments", c2h::device_vector expected_keys(keys_in_buffer); // Run the top-k algorithm - batched_topk_keys( + batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__constant{}, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_segments * segment_size}); + ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_segments * segment_size}); // Prepare expected results fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction); compact_sorted_keys_to_topk(expected_keys, segment_size, k); @@ -251,15 +250,14 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment c2h::device_vector expected_keys(keys_in_buffer); // Run the top-k algorithm - batched_topk_keys( + batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::__argument::__immediate_sequence{ - segment_size_it, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__constant{}, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_items}); + ::cuda::args::__immediate_sequence{ + segment_size_it, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_items}); // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction); @@ -289,15 +287,13 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment auto d_keys_out_it = cuda::make_strided_iterator(cuda::make_counting_iterator(thrust::raw_pointer_cast(d_keys_out.data())), k); - batched_topk_keys( + batched_topk_keys( d_keys_in_it, d_keys_out_it, - ::cuda::__argument::__immediate{ - segment_size, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__constant{}, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_segments * segment_size}); + ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_segments * segment_size}); const int num_minus_zero = static_cast(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{})); REQUIRE(num_minus_zero >= 1); diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu index b16a97b2472..6ba474daf81 100644 --- a/cub/test/catch2_test_device_segmented_topk_pairs.cu +++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu @@ -47,8 +47,46 @@ struct flag_intra_segment_duplicates template flag_intra_segment_duplicates(ItemItT, SegIdItT) -> flag_intra_segment_duplicates; +template +CUB_RUNTIME_FUNCTION static cudaError_t dispatch_batched_topk_pairs( + void* d_temp_storage, + size_t& temp_storage_bytes, + KeyInputItItT d_key_segments_it, + KeyOutputItItT d_key_segments_out_it, + ValueInputItItT d_value_segments_it, + ValueOutputItItT d_value_segments_out_it, + SegmentSizeParameterT segment_sizes, + KParameterT k, + NumSegmentsParameterT num_segments, + TotalNumItemsGuaranteeT total_num_items_guarantee, + cudaStream_t stream = nullptr) +{ + return cub::detail::batched_topk::dispatch( + d_temp_storage, + temp_storage_bytes, + d_key_segments_it, + d_key_segments_out_it, + d_value_segments_it, + d_value_segments_out_it, + segment_sizes, + k, + ::cuda::args::constant{}, + num_segments, + total_num_items_guarantee, + stream); +} + // %PARAM% TEST_LAUNCH lid 0:1:2 -DECLARE_LAUNCH_WRAPPER(cub::detail::batched_topk::dispatch, batched_topk_pairs); +DECLARE_TMPL_LAUNCH_WRAPPER( + dispatch_batched_topk_pairs, batched_topk_pairs, cub::detail::topk::select Direction, Direction); // Total segment size using max_segment_size_list = c2h::enum_type_list; @@ -220,16 +258,15 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments" c2h::device_vector expected_keys(keys_in_buffer); // Run the top-k algorithm - batched_topk_pairs( + batched_topk_pairs( d_keys_in, d_keys_out, d_values_in, d_values_out, - ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__constant{}, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_segments * segment_size}); + ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_segments * segment_size}); // Verification: // - We verify correct top-k selection through the keys @@ -341,17 +378,16 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen c2h::device_vector expected_keys(keys_in_buffer); // Run the top-k algorithm - batched_topk_pairs( + batched_topk_pairs( d_keys_in, d_keys_out, d_values_in, d_values_out, - ::cuda::__argument::__immediate_sequence{ - segment_size_it, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__constant{}, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_items}); + ::cuda::args::__immediate_sequence{ + segment_size_it, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_items}); // Verification: // - We verify correct top-k selection through the keys diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index a1d55db5045..0877a8df6d1 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -47,6 +47,8 @@ _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT +struct __access; + // ===================================================================== // __element_type_of // ===================================================================== @@ -87,7 +89,7 @@ inline constexpr bool __is_sequence_v = || ::cuda::std::__has_random_access_traversal<_Tp>; // ===================================================================== -// __constant +// constant // ===================================================================== // Non-sequence wrappers intentionally do not reject types with a distinct element type. @@ -95,51 +97,179 @@ inline constexpr bool __is_sequence_v = // spelling carries that intent. //! @brief Wraps a compile-time constant argument value. -template -struct __constant +template +class constant { - using value_type = ::cuda::std::remove_cvref_t; +public: + using value_type = ::cuda::std::remove_cvref_t<_Tp>; using __element_type = value_type; - [[nodiscard]] _CCCL_API static constexpr value_type value() noexcept + [[nodiscard]] _CCCL_API static constexpr value_type __get_value() noexcept { - return _Value; + return static_cast(_Value); } }; //! @brief Wraps a compile-time constant argument sequence. template -struct __constant_sequence +class __constant_sequence { +public: using value_type = ::cuda::std::remove_cvref_t; using __element_type = __element_type_of_t; static_assert(__is_sequence_v, "The value type of __constant_sequence must be a sequence"); +}; + +// __assert_in_range +// ===================================================================== - [[nodiscard]] _CCCL_API static constexpr value_type value() noexcept +template +_CCCL_API constexpr void __assert_in_range([[maybe_unused]] _From __val) noexcept +{ + if constexpr (::cuda::std::__cccl_is_cv_integer_v<_To> && ::cuda::std::__cccl_is_cv_integer_v<_From>) { - return _Value; + _CCCL_ASSERT(::cuda::std::in_range<::cuda::std::remove_cv_t<_To>>(__val), + "runtime bound value overflows the element type"); } -}; +} + +template +[[nodiscard]] _CCCL_API constexpr _To __runtime_bound_cast(_From __val) noexcept +{ + __assert_in_range<_To>(__val); + return static_cast<_To>(__val); +} + +template +_CCCL_API constexpr bool __static_bound_in_range() noexcept +{ + using _RawTo = ::cuda::std::remove_cv_t<_To>; + using _RawFrom = ::cuda::std::remove_cv_t; + + if constexpr (::cuda::std::__cccl_is_integer_v<_RawTo> && ::cuda::std::__cccl_is_integer_v<_RawFrom>) + { + return ::cuda::std::in_range<_RawTo>(_Value); + } + else if constexpr (::cuda::std::is_arithmetic_v<_RawTo> && ::cuda::std::is_arithmetic_v<_RawFrom>) + { + return static_cast<_RawFrom>(static_cast<_RawTo>(_Value)) == _Value; + } + else + { + return true; + } +} + +template +inline constexpr bool __valid_static_bounds_v = false; + +template +inline constexpr bool __valid_static_bounds_v<_ElementType, no_bounds> = true; + +template +inline constexpr bool __valid_static_bounds_v<_ElementType, static_bounds<_Lowest, _Highest>> = + __static_bound_in_range<_ElementType, _Lowest>() && __static_bound_in_range<_ElementType, _Highest>(); + +template +_CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept +{ + if constexpr (::cuda::std::is_same_v<_StaticBounds, no_bounds>) + { + return ::cuda::std::numeric_limits<_ElementType>::lowest(); + } + else + { + return static_cast<_ElementType>(_StaticBounds::lower()); + } +} + +template +_CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept +{ + if constexpr (::cuda::std::is_same_v<_StaticBounds, no_bounds>) + { + return (::cuda::std::numeric_limits<_ElementType>::max)(); + } + else + { + return static_cast<_ElementType>(_StaticBounds::upper()); + } +} + +template +_CCCL_API constexpr _ElementType __effective_lowest(runtime_bounds<_ElementType> __runtime_bounds) noexcept +{ + auto __static_lowest = __wrapper_static_lowest<_ElementType, _StaticBounds>(); + return __static_lowest > __runtime_bounds.lower() ? __static_lowest : __runtime_bounds.lower(); +} + +template +_CCCL_API constexpr _ElementType __effective_highest(runtime_bounds<_ElementType> __runtime_bounds) noexcept +{ + auto __static_highest = __wrapper_static_highest<_ElementType, _StaticBounds>(); + return __static_highest < __runtime_bounds.upper() ? __static_highest : __runtime_bounds.upper(); +} + +template +_CCCL_API constexpr bool __has_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept +{ + return __effective_lowest<_ElementType, _StaticBounds>(__runtime_bounds) + <= __effective_highest<_ElementType, _StaticBounds>(__runtime_bounds); +} + +template +_CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept +{ + static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>, + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " + "values representable by the element type"); + _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)), + "static and runtime argument bounds do not intersect"); +} + +template +_CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const _ElementType& __val) noexcept +{ + if constexpr (!::cuda::std::is_same_v<_StaticBounds, no_bounds>) + { + _CCCL_ASSERT((__val >= __wrapper_static_lowest<_ElementType, _StaticBounds>()), + "immediate argument value is below static lowest bound"); + _CCCL_ASSERT((__val <= __wrapper_static_highest<_ElementType, _StaticBounds>()), + "immediate argument value is above static highest bound"); + } +} + +template +_CCCL_API constexpr void __validate_runtime_element_bounds( + [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] runtime_bounds<_ElementType> __runtime_bounds) noexcept +{ + _CCCL_ASSERT((__val >= __runtime_bounds.lower()), "immediate argument value is below runtime lower bound"); + _CCCL_ASSERT((__val <= __runtime_bounds.upper()), "immediate argument value is above runtime upper bound"); +} // ===================================================================== -// __immediate +// immediate // ===================================================================== //! @brief Wraps a runtime argument value with optional bounds. //! //! The value is host-accessible at API call time. -template -struct __immediate +template +class immediate { +public: using __element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " + "values representable by the element type"); + +private: + friend struct __access; _Arg __arg_; -private: _CCCL_API constexpr void __validate_value() const noexcept { if constexpr (::cuda::std::is_same_v<::cuda::std::remove_cvref_t<_Arg>, __element_type> @@ -150,13 +280,13 @@ struct __immediate } public: - _CCCL_API constexpr __immediate(_Arg __arg) noexcept + _CCCL_API constexpr immediate(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_value(); } - _CCCL_API constexpr __immediate(_Arg __arg, _StaticBounds) noexcept + _CCCL_API constexpr immediate(_Arg __arg, _StaticBounds) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_value(); @@ -165,8 +295,8 @@ struct __immediate #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __immediate(_Arg, __static_bounds<_Lowest, _Highest>) - -> __immediate<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate(_Arg, static_bounds<_Lowest, _Highest>) + -> immediate<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED @@ -175,19 +305,23 @@ _CCCL_HOST_DEVICE __immediate(_Arg, __static_bounds<_Lowest, _Highest>) // ===================================================================== //! @brief Wraps a runtime argument sequence with optional bounds. -template -struct __immediate_sequence +template +class __immediate_sequence { +public: using __element_type = __element_type_of_t<_Arg>; static_assert(__is_sequence_v<_Arg>, "immediate sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " + "values representable by the element type"); + +private: + friend struct __access; _Arg __arg_; - __runtime_bounds<__element_type> __runtime_bounds_{}; + runtime_bounds<__element_type> __runtime_bounds_{}; -private: _CCCL_API constexpr void __validate_bounds() const noexcept { __validate_bounds_intersection<__element_type, _StaticBounds>(__runtime_bounds_); @@ -230,7 +364,7 @@ struct __immediate_sequence } template - _CCCL_API constexpr __immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -240,7 +374,7 @@ struct __immediate_sequence } template - _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -250,41 +384,47 @@ struct __immediate_sequence } template - _CCCL_API constexpr __immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept : __immediate_sequence(::cuda::std::move(__arg), __sb, __rb) {} }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>) - -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE __immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>) + -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE __immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE __immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== -// __deferred_base / __deferred / __deferred_sequence +// __deferred_base / deferred / deferred_sequence // ===================================================================== //! @brief Common base for deferred argument wrappers. -template -struct __deferred_base +template +class __deferred_base { +public: using __element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " + "values representable by the element type"); + +private: + friend struct __access; _Arg __arg_; - __runtime_bounds<__element_type> __runtime_bounds_{}; + runtime_bounds<__element_type> __runtime_bounds_{}; +public: _CCCL_API constexpr __deferred_base(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} { @@ -298,7 +438,7 @@ struct __deferred_base } template - _CCCL_API constexpr __deferred_base(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __deferred_base(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -307,7 +447,7 @@ struct __deferred_base } template - _CCCL_API constexpr __deferred_base(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __deferred_base(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -316,67 +456,161 @@ struct __deferred_base } template - _CCCL_API constexpr __deferred_base(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept + _CCCL_API constexpr __deferred_base(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept : __deferred_base(::cuda::std::move(__arg), __sb, __rb) {} }; //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available //! by the time the argument is consumed in stream order. -template -struct __deferred : __deferred_base<_Arg, _StaticBounds> +template +class deferred : public __deferred_base<_Arg, _StaticBounds> { +public: using __deferred_base<_Arg, _StaticBounds>::__deferred_base; }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __deferred(_Arg) -> __deferred<_Arg>; +_CCCL_HOST_DEVICE deferred(_Arg) -> deferred<_Arg>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __static_bounds<_Lowest, _Highest>) - -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, static_bounds<_Lowest, _Highest>) -> deferred<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __runtime_bounds<_Tp>) -> __deferred<_Arg>; +_CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>) -> deferred<_Arg>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> deferred<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> deferred<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be //! available by the time the argument is consumed in stream order. -template -struct __deferred_sequence : __deferred_base<_Arg, _StaticBounds> +template +class deferred_sequence : public __deferred_base<_Arg, _StaticBounds> { +public: + static_assert(__is_sequence_v<_Arg>, "deferred sequence arguments must have a distinct element type"); + using __deferred_base<_Arg, _StaticBounds>::__deferred_base; }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg) -> __deferred_sequence<_Arg>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg) -> deferred_sequence<_Arg>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>) - -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, static_bounds<_Lowest, _Highest>) + -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> __deferred_sequence<_Arg>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>) -> deferred_sequence<_Arg>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED +// ===================================================================== +// __access +// ===================================================================== + +struct __access +{ + template + [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(immediate<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const _Arg& __arg(const immediate<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(immediate<_Arg, _StaticBounds>&& __wrapper) noexcept + { + return ::cuda::std::move(__wrapper.__arg_); + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(__immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const _Arg& + __arg(const __immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(__immediate_sequence<_Arg, _StaticBounds>&& __wrapper) noexcept + { + return ::cuda::std::move(__wrapper.__arg_); + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(__deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const _Arg& + __arg(const __deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(__deferred_base<_Arg, _StaticBounds>&& __wrapper) noexcept + { + return ::cuda::std::move(__wrapper.__arg_); + } + + template + [[nodiscard]] _CCCL_API static constexpr runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(__immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(const __immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } + + template + [[nodiscard]] _CCCL_API static constexpr runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(__deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(const __deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } +}; + // ===================================================================== // __unwrap // ===================================================================== @@ -384,17 +618,17 @@ _CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_boun template inline constexpr bool __is_wrapper_v = false; template -inline constexpr bool __is_wrapper_v<__immediate<_Arg, _StaticBounds>> = true; -template -inline constexpr bool __is_wrapper_v<__constant<_Value>> = true; +inline constexpr bool __is_wrapper_v> = true; +template +inline constexpr bool __is_wrapper_v> = true; template inline constexpr bool __is_wrapper_v<__constant_sequence<_Value>> = true; template inline constexpr bool __is_wrapper_v<__immediate_sequence<_Arg, _StaticBounds>> = true; template -inline constexpr bool __is_wrapper_v<__deferred<_Arg, _StaticBounds>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__deferred_sequence<_Arg, _StaticBounds>> = true; +inline constexpr bool __is_wrapper_v> = true; _CCCL_TEMPLATE(class _Tp) _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) @@ -404,28 +638,28 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } -template -[[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t -__unwrap(const __constant<_Value>&) noexcept +template +[[nodiscard]] _CCCL_API constexpr typename constant<_Value, _Tp>::value_type +__unwrap(const constant<_Value, _Tp>&) noexcept { - return _Value; + return constant<_Value, _Tp>::__get_value(); } template @@ -438,67 +672,67 @@ __unwrap(const __constant_sequence<_Value>&) noexcept template [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__deferred<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __deferred<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__deferred<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } -template +template _CCCL_API constexpr auto __constant_compute_lowest() noexcept { - return _Value; + return constant<_Value, _Tp>::__get_value(); } -template +template _CCCL_API constexpr auto __constant_compute_highest() noexcept { - return _Value; + return constant<_Value, _Tp>::__get_value(); } template @@ -549,25 +783,26 @@ struct __traits_impl static constexpr element_type highest = (::cuda::std::numeric_limits::max)(); }; -template -struct __traits_impl<__constant<_Value>> +template +struct __traits_impl> { - using value_type = ::cuda::std::remove_cvref_t; + using value_type = typename constant<_Value, _Tp>::value_type; using element_type = value_type; static constexpr bool is_constant = true; static constexpr bool is_deferred = false; static constexpr bool is_single_value = true; - static constexpr element_type lowest = __constant_compute_lowest<_Value>(); - static constexpr element_type highest = __constant_compute_highest<_Value>(); + static constexpr element_type lowest = __constant_compute_lowest<_Value, _Tp>(); + static constexpr element_type highest = __constant_compute_highest<_Value, _Tp>(); }; template -struct __traits_impl<__immediate<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = false; @@ -596,7 +831,8 @@ struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>> using element_type = __element_type_of_t<_Arg>; static_assert(__is_sequence_v, "immediate sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = false; @@ -606,12 +842,13 @@ struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>> }; template -struct __traits_impl<__deferred<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = true; @@ -621,12 +858,14 @@ struct __traits_impl<__deferred<_Arg, _StaticBounds>> }; template -struct __traits_impl<__deferred_sequence<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; + static_assert(__is_sequence_v, "deferred sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = true; @@ -651,10 +890,10 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) ) return ::cuda::std::numeric_limits<__element_type_of_t<_Tp>>::lowest(); } -template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant<_Value>) noexcept +template +[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value, _Tp>) noexcept { - return __constant_compute_lowest<_Value>(); + return __constant_compute_lowest<_Value, _Tp>(); } template @@ -664,33 +903,36 @@ template } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate<_Arg, _StaticBounds> __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds); } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__deferred<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds); } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__deferred_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds); } //! @brief Returns the effective highest bound, combining static and runtime bounds. @@ -701,10 +943,10 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) ) return (::cuda::std::numeric_limits<__element_type_of_t<_Tp>>::max)(); } -template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant<_Value>) noexcept +template +[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value, _Tp>) noexcept { - return __constant_compute_highest<_Value>(); + return __constant_compute_highest<_Value, _Tp>(); } template @@ -714,33 +956,36 @@ template } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate<_Arg, _StaticBounds> __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_highest<_ET, _StaticBounds>(__runtime_bounds); } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__deferred<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(deferred<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_highest<_ET, _StaticBounds>(__runtime_bounds); } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__deferred_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_highest<_ET, _StaticBounds>(__runtime_bounds); } _CCCL_END_NAMESPACE_CUDA_ARGUMENT diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h index 46dd22c370c..ce6d3a56743 100644 --- a/libcudacxx/include/cuda/__argument/argument_bounds.h +++ b/libcudacxx/include/cuda/__argument/argument_bounds.h @@ -22,11 +22,8 @@ #endif // no system header #include -#include #include -#include #include -#include #include #include @@ -34,7 +31,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT //! @brief Sentinel type indicating no bounds are present. -struct __no_bounds +struct no_bounds {}; // ===================================================================== @@ -48,8 +45,9 @@ struct __no_bounds //! @tparam _Lower The static lower bound. //! @tparam _Upper The static upper bound. template -struct __static_bounds +class static_bounds { +public: static_assert(::cuda::std::is_same_v, "Static bounds endpoints must have the same type"); static_assert(_Lower <= _Upper, "Lower bound must be <= upper bound"); @@ -67,7 +65,7 @@ struct __static_bounds template inline constexpr bool __is_static_bounds_v = false; template -inline constexpr bool __is_static_bounds_v<__static_bounds<_Lower, _Upper>> = true; +inline constexpr bool __is_static_bounds_v> = true; // ===================================================================== // runtime_bounds @@ -77,14 +75,15 @@ inline constexpr bool __is_static_bounds_v<__static_bounds<_Lower, _Upper>> = tr //! //! @tparam _Tp The value type of the bounds. template -struct __runtime_bounds +class runtime_bounds { _Tp __lower_ = ::cuda::std::numeric_limits<_Tp>::lowest(); _Tp __upper_ = (::cuda::std::numeric_limits<_Tp>::max)(); - constexpr __runtime_bounds() noexcept = default; +public: + constexpr runtime_bounds() noexcept = default; - _CCCL_API constexpr __runtime_bounds(_Tp __lower, _Tp __upper) noexcept + _CCCL_API constexpr runtime_bounds(_Tp __lower, _Tp __upper) noexcept : __lower_(__lower) , __upper_(__upper) { @@ -104,13 +103,13 @@ struct __runtime_bounds #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __runtime_bounds(_Tp, _Tp) -> __runtime_bounds<_Tp>; +_CCCL_HOST_DEVICE runtime_bounds(_Tp, _Tp) -> runtime_bounds<_Tp>; #endif // _CCCL_DOXYGEN_INVOKED template inline constexpr bool __is_runtime_bounds_v = false; template -inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true; +inline constexpr bool __is_runtime_bounds_v> = true; // ===================================================================== // bounds — factory functions @@ -122,7 +121,7 @@ inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true; //! @tparam _Upper The static upper bound. //! @return A compile-time bounds object. template -[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> __bounds() noexcept +[[nodiscard]] _CCCL_API constexpr static_bounds<_Lower, _Upper> bounds() noexcept { return {}; } @@ -133,7 +132,7 @@ template //! @param __upper The runtime upper bound. //! @return A runtime bounds object. template -[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> __bounds(_Tp __lower, _Tp __upper) noexcept +[[nodiscard]] _CCCL_API constexpr runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept { return {__lower, __upper}; } @@ -145,129 +144,6 @@ inline constexpr bool __is_runtime_bounds_cv_v = __is_runtime_bounds_v<::cuda::s template inline constexpr bool __is_bounds_v = __is_static_bounds_cv_v<_Tp> || __is_runtime_bounds_cv_v<_Tp>; -// ===================================================================== -// __assert_in_range -// ===================================================================== - -template -_CCCL_API constexpr void __assert_in_range([[maybe_unused]] _From __val) noexcept -{ - if constexpr (::cuda::std::__cccl_is_integer_v<_To> && ::cuda::std::__cccl_is_integer_v<_From>) - { - _CCCL_ASSERT(::cuda::std::in_range<_To>(__val), "runtime bound value overflows the element type"); - } -} - -template -[[nodiscard]] _CCCL_API constexpr _To __runtime_bound_cast(_From __val) noexcept -{ - __assert_in_range<_To>(__val); - return static_cast<_To>(__val); -} - -template -_CCCL_API constexpr bool __static_bound_in_range() noexcept -{ - using _From = decltype(_Value); - - if constexpr (::cuda::std::__cccl_is_integer_v<_To> && ::cuda::std::__cccl_is_integer_v<_From>) - { - return ::cuda::std::in_range<_To>(_Value); - } - else if constexpr (::cuda::std::is_arithmetic_v<_To> && ::cuda::std::is_arithmetic_v<_From>) - { - return static_cast<_From>(static_cast<_To>(_Value)) == _Value; - } - else - { - return true; - } -} - -template -inline constexpr bool __valid_static_bounds_v = true; - -template -inline constexpr bool __valid_static_bounds_v<_ElementType, __static_bounds<_Lowest, _Highest>> = - ::cuda::__argument::__static_bound_in_range<_ElementType, _Lowest>() - && ::cuda::__argument::__static_bound_in_range<_ElementType, _Highest>(); - -template -_CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept -{ - if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>) - { - return ::cuda::std::numeric_limits<_ElementType>::lowest(); - } - else - { - return static_cast<_ElementType>(_StaticBounds::lower()); - } -} - -template -_CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept -{ - if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>) - { - return (::cuda::std::numeric_limits<_ElementType>::max)(); - } - else - { - return static_cast<_ElementType>(_StaticBounds::upper()); - } -} - -template -_CCCL_API constexpr _ElementType __effective_lowest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept -{ - auto __static_lowest = ::cuda::__argument::__wrapper_static_lowest<_ElementType, _StaticBounds>(); - return __static_lowest > __runtime_bounds.lower() ? __static_lowest : __runtime_bounds.lower(); -} - -template -_CCCL_API constexpr _ElementType __effective_highest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept -{ - auto __static_highest = ::cuda::__argument::__wrapper_static_highest<_ElementType, _StaticBounds>(); - return __static_highest < __runtime_bounds.upper() ? __static_highest : __runtime_bounds.upper(); -} - -template -_CCCL_API constexpr bool __valid_argument_bounds(__runtime_bounds<_ElementType> __runtime_bounds) noexcept -{ - return ::cuda::__argument::__effective_lowest<_ElementType, _StaticBounds>(__runtime_bounds) - <= ::cuda::__argument::__effective_highest<_ElementType, _StaticBounds>(__runtime_bounds); -} - -template -_CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept -{ - static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); - _CCCL_VERIFY((::cuda::__argument::__valid_argument_bounds<_ElementType, _StaticBounds>(__runtime_bounds)), - "static and runtime argument bounds do not intersect"); -} - -template -_CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const _ElementType& __val) noexcept -{ - if constexpr (!::cuda::std::is_same_v<_StaticBounds, __no_bounds>) - { - _CCCL_ASSERT((__val >= ::cuda::__argument::__wrapper_static_lowest<_ElementType, _StaticBounds>()), - "immediate argument value is below static lowest bound"); - _CCCL_ASSERT((__val <= ::cuda::__argument::__wrapper_static_highest<_ElementType, _StaticBounds>()), - "immediate argument value is above static highest bound"); - } -} - -template -_CCCL_API constexpr void __validate_runtime_element_bounds( - [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] __runtime_bounds<_ElementType> __runtime_bounds) noexcept -{ - _CCCL_ASSERT((__val >= __runtime_bounds.lower()), "immediate argument value is below runtime lower bound"); - _CCCL_ASSERT((__val <= __runtime_bounds.upper()), "immediate argument value is above runtime upper bound"); -} - _CCCL_END_NAMESPACE_CUDA_ARGUMENT #include diff --git a/libcudacxx/include/cuda/argument b/libcudacxx/include/cuda/argument new file mode 100644 index 00000000000..bcb079e7e5e --- /dev/null +++ b/libcudacxx/include/cuda/argument @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_ARGUMENT_ +#define _CUDA_ARGUMENT_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#endif // _CUDA_ARGUMENT_ diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h index 325830c17cf..4876dd6f449 100644 --- a/libcudacxx/include/cuda/std/__internal/namespaces.h +++ b/libcudacxx/include/cuda/std/__internal/namespaces.h @@ -115,8 +115,8 @@ #define _CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION _CCCL_BEGIN_NAMESPACE(cuda::execution) #define _CCCL_END_NAMESPACE_CUDA_EXECUTION _CCCL_END_NAMESPACE(cuda::execution) -#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::__argument) -#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT _CCCL_END_NAMESPACE(cuda::__argument) +#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::args) +#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT _CCCL_END_NAMESPACE(cuda::args) // Namespace to avoid name collisions with CPOs on clang-16 (see // https://godbolt.org/z/9TadonrdM for example). MSVC's ancient parser also gets confused with diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp index d7f394a0d74..46070dbabbd 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include @@ -16,31 +16,31 @@ TEST_FUNC constexpr bool test() { - // --- static_argument_bounds --- + // --- static_bounds --- // Basic static bounds { - constexpr auto b = cuda::__argument::__static_bounds<1, 4096>{}; + constexpr auto b = cuda::args::static_bounds<1, 4096>{}; static_assert(b.lower() == 1); static_assert(b.upper() == 4096); } // Exact static bounds { - constexpr auto b = cuda::__argument::__static_bounds<42, 42>{}; + constexpr auto b = cuda::args::static_bounds<42, 42>{}; static_assert(b.lower() == 42); static_assert(b.upper() == 42); } // Long type deduced from NTTPs { - static_assert(cuda::std::is_same_v::lower()), long>); + static_assert(cuda::std::is_same_v::lower()), long>); } #if TEST_HAS_CLASS_NTTP // Static bounds preserve their original NTTP types { - constexpr auto b = cuda::__argument::__bounds<1.0f, 8.0f>(); + constexpr auto b = cuda::args::bounds<1.0f, 8.0f>(); static_assert(b.lower() == 1.0f); static_assert(b.upper() == 8); static_assert(cuda::std::is_same_v); @@ -48,11 +48,11 @@ TEST_FUNC constexpr bool test() } #endif // TEST_HAS_CLASS_NTTP - // --- runtime_argument_bounds --- + // --- runtime_bounds --- // Basic runtime bounds { - auto b = cuda::__argument::__runtime_bounds{10, 100}; + auto b = cuda::args::runtime_bounds{10, 100}; assert(b.lower() == 10); assert(b.upper() == 100); static_assert(cuda::std::is_same_v); @@ -62,35 +62,43 @@ TEST_FUNC constexpr bool test() // Static via factory { - constexpr auto b = cuda::__argument::__bounds<1, 8>(); + constexpr auto b = cuda::args::bounds<1, 8>(); static_assert(b.lower() == 1); static_assert(b.upper() == 8); - static_assert(cuda::__argument::__is_static_bounds_cv_v); - static_assert(!cuda::__argument::__is_runtime_bounds_cv_v); - static_assert(cuda::__argument::__is_bounds_v); + static_assert(cuda::args::__is_static_bounds_cv_v); + static_assert(!cuda::args::__is_runtime_bounds_cv_v); + static_assert(cuda::args::__is_bounds_v); } // Runtime via factory { - auto b = cuda::__argument::__bounds(10, 100); + auto b = cuda::args::bounds(10, 100); assert(b.lower() == 10); assert(b.upper() == 100); - static_assert(!cuda::__argument::__is_static_bounds_cv_v); - static_assert(cuda::__argument::__is_runtime_bounds_cv_v); - static_assert(cuda::__argument::__is_bounds_v); + static_assert(!cuda::args::__is_static_bounds_cv_v); + static_assert(cuda::args::__is_runtime_bounds_cv_v); + static_assert(cuda::args::__is_bounds_v); } // Static and runtime bounds intersection { - static_assert(cuda::__argument::__valid_argument_bounds>( - cuda::__argument::__runtime_bounds{50, 200})); - static_assert(!cuda::__argument::__valid_argument_bounds>( - cuda::__argument::__runtime_bounds{0, 50})); + static_assert(cuda::args::__has_bounds_intersection>( + cuda::args::runtime_bounds{50, 200})); + static_assert(!cuda::args::__has_bounds_intersection>( + cuda::args::runtime_bounds{0, 50})); } // Non-bounds type { - static_assert(!cuda::__argument::__is_bounds_v); + static_assert(!cuda::args::__is_bounds_v); + } + + // Bounds types accepted by argument wrapper template parameters + { + static_assert(cuda::args::__valid_static_bounds_v); + static_assert(cuda::args::__valid_static_bounds_v>); + static_assert(!cuda::args::__valid_static_bounds_v>); + static_assert(!cuda::args::__valid_static_bounds_v); } return true; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp index 13753040f08..aaf57291e23 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -54,138 +54,127 @@ struct non_sequence_value TEST_FUNC void test() { - // --- __is_sequence_v / __is_single_value_v --- + // --- __is_sequence_v --- // builtin and class type are not sequences - static_assert(!cuda::__argument::__is_sequence_v); - static_assert(!cuda::__argument::__is_sequence_v); - static_assert(!cuda::__argument::__is_sequence_v); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v); + static_assert(!cuda::args::__is_sequence_v); + static_assert(!cuda::args::__is_sequence_v); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); // iterators and pointers can be sequences if they are at least random access - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(!cuda::__argument::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v>); + static_assert(!cuda::args::__is_sequence_v>); // ranges and arrays are sequences - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(cuda::__argument::__is_sequence_v&>); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(cuda::__argument::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v&>); + static_assert(cuda::args::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v>); // --- __element_type_of_t --- - static_assert(cuda::std::is_same_v&>, int>); - static_assert(cuda::std::is_same_v, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v&>, int>); + static_assert(cuda::std::is_same_v, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); static_assert( - cuda::std::is_same_v>>, - int>); - static_assert(cuda::std::is_same_v>, int>); + cuda::std::is_same_v>>, int>); + static_assert(cuda::std::is_same_v>, int>); // --- argument_traits: is_deferred --- - static_assert(!cuda::__argument::__traits::is_deferred); - static_assert(!cuda::__argument::__traits>::is_deferred); - static_assert(!cuda::__argument::__traits>>::is_deferred); - static_assert(!cuda::__argument::__traits>::is_deferred); + static_assert(!cuda::args::__traits::is_deferred); + static_assert(!cuda::args::__traits>::is_deferred); + static_assert(!cuda::args::__traits>>::is_deferred); + static_assert(!cuda::args::__traits>::is_deferred); #if TEST_HAS_CLASS_NTTP - static_assert( - !cuda::__argument::__traits{1, 2, 3}>>::is_deferred); + static_assert(!cuda::args::__traits{1, 2, 3}>>::is_deferred); #endif // TEST_HAS_CLASS_NTTP - static_assert(cuda::__argument::__traits>>::is_deferred); - static_assert(cuda::__argument::__traits>>::is_deferred); + static_assert(cuda::args::__traits>>::is_deferred); + static_assert(cuda::args::__traits>>::is_deferred); // --- argument_traits: is_single_value --- - static_assert(cuda::__argument::__traits::is_single_value); - static_assert(cuda::__argument::__traits::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert( - cuda::__argument::__traits>>::is_single_value); - static_assert( - !cuda::__argument::__traits>>::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); + static_assert(cuda::args::__traits::is_single_value); + static_assert(cuda::args::__traits::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(cuda::args::__traits>>::is_single_value); + static_assert(!cuda::args::__traits>>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); #if TEST_HAS_CLASS_NTTP - static_assert(!cuda::__argument::__traits< - cuda::__argument::__constant_sequence{1, 2, 3}>>::is_single_value); -#endif // TEST_HAS_CLASS_NTTP - static_assert(cuda::__argument::__traits>::is_single_value); static_assert( - !cuda::__argument::__traits>>::is_single_value); + !cuda::args::__traits{1, 2, 3}>>::is_single_value); +#endif // TEST_HAS_CLASS_NTTP + static_assert(cuda::args::__traits>::is_single_value); + static_assert(!cuda::args::__traits>>::is_single_value); // --- argument_traits: value_type --- - static_assert(cuda::std::is_same_v::value_type, int>); - static_assert(cuda::std::is_same_v>::value_type, int>); - static_assert(cuda::std::is_same_v< - cuda::__argument::__traits>>::value_type, - cuda::std::span>); - static_assert(cuda::std::is_same_v>::value_type, int>); -#if TEST_HAS_CLASS_NTTP + static_assert(cuda::std::is_same_v::value_type, int>); + static_assert(cuda::std::is_same_v>::value_type, int>); static_assert( - cuda::std::is_same_v< - cuda::__argument::__traits{1, 2, 3}>>::value_type, - cuda::std::array>); + cuda::std::is_same_v>>::value_type, + cuda::std::span>); + static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert(cuda::std::is_same_v>::value_type, float>); +#if TEST_HAS_CLASS_NTTP + static_assert(cuda::std::is_same_v< + cuda::args::__traits{1, 2, 3}>>::value_type, + cuda::std::array>); #endif // TEST_HAS_CLASS_NTTP // --- argument_traits: lowest / highest --- - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert( - cuda::__argument::__traits>>::lowest - == 1); - static_assert( - cuda::__argument::__traits>&>::highest - == 8); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__traits>>::lowest == 1); + static_assert(cuda::args::__traits>&>::highest == 8); static_assert( - cuda::__argument::__traits< - cuda::__argument::__immediate_sequence, cuda::__argument::__static_bounds<1, 8>>>::highest + cuda::args::__traits, cuda::args::static_bounds<1, 8>>>::highest == 8); + static_assert(cuda::args::__traits>::lowest == 10.0f); + static_assert(cuda::args::__traits>::highest == 10.0f); #if TEST_HAS_CLASS_NTTP - static_assert( - cuda::__argument::__traits{3, 1, 2}>>::lowest == 1); - static_assert( - cuda::__argument::__traits{3, 1, 2}>>::highest == 3); + static_assert(cuda::args::__traits{3, 1, 2}>>::lowest == 1); + static_assert(cuda::args::__traits{3, 1, 2}>>::highest == 3); #endif // TEST_HAS_CLASS_NTTP // --- Free function bounds on plain values --- - static_assert(cuda::__argument::__lowest_(42) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__highest_(42) == (cuda::std::numeric_limits::max)()); - static_assert(cuda::__argument::__lowest_(1.0f) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__highest_(1.0f) == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__lowest_(42) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__highest_(42) == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__lowest_(1.0f) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__highest_(1.0f) == (cuda::std::numeric_limits::max)()); // --- Scalar and sequence wrappers expose distinct single-value traits --- - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert( - !cuda::__argument::__traits>>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(!cuda::args::__traits>>::is_single_value); #if TEST_HAS_CLASS_NTTP - static_assert(!cuda::__argument::__traits< - cuda::__argument::__constant_sequence{1, 2, 3}>>::is_single_value); + static_assert( + !cuda::args::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp index 9949e0013b4..21b200e8f0f 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -22,102 +22,110 @@ TEST_FUNC constexpr bool test() // Deferred single value via span { int val = 42; - auto def = cuda::__argument::__deferred{cuda::std::span{&val, 1}}; - assert(cuda::__argument::__unwrap(def)[0] == 42); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); + auto def = cuda::args::deferred{cuda::std::span{&val, 1}}; + assert(cuda::args::__unwrap(def)[0] == 42); + assert(cuda::args::__access::__arg(def)[0] == 42); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); } // Deferred single value with static bounds { int val = 42; - auto def = cuda::__argument::__deferred{cuda::std::span{&val, 1}, cuda::__argument::__bounds<1, 1000>()}; - assert(cuda::__argument::__unwrap(def)[0] == 42); - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 1000); + auto def = cuda::args::deferred{cuda::std::span{&val, 1}, cuda::args::bounds<1, 1000>()}; + assert(cuda::args::__unwrap(def)[0] == 42); + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 1000); } // Deferred single value via pointer { int val = 42; - using def_t = cuda::__argument::__deferred>; - static_assert(cuda::__argument::__traits::lowest == 0); - static_assert(cuda::__argument::__traits::highest == 100); + using def_t = cuda::args::deferred>; + static_assert(cuda::args::__traits::lowest == 0); + static_assert(cuda::args::__traits::highest == 100); // Also verify construction works - auto def = cuda::__argument::__deferred{&val, cuda::__argument::__bounds<0, 100>()}; - assert(cuda::__argument::__unwrap(def) == &val); + auto def = cuda::args::deferred{&val, cuda::args::bounds<0, 100>()}; + assert(cuda::args::__unwrap(def) == &val); } // Deferred single value via fancy iterator { auto it = cuda::counting_iterator{42}; - auto def = cuda::__argument::__deferred{it, cuda::__argument::__bounds<0, 100>()}; - assert(cuda::__argument::__unwrap(def)[0] == 42); - static_assert(cuda::__argument::__traits::lowest == 0); - static_assert(cuda::__argument::__traits::highest == 100); - static_assert(cuda::__argument::__traits::is_single_value); + auto def = cuda::args::deferred{it, cuda::args::bounds<0, 100>()}; + assert(cuda::args::__unwrap(def)[0] == 42); + static_assert(cuda::args::__traits::lowest == 0); + static_assert(cuda::args::__traits::highest == 100); + static_assert(cuda::args::__traits::is_single_value); } // Deferred single value with both bounds, runtime bounds first { - int val = 42; - auto def = cuda::__argument::__deferred{ - cuda::std::span{&val, 1}, cuda::__argument::__bounds(5, 100), cuda::__argument::__bounds<1, 256>()}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 256); - assert(cuda::__argument::__lowest_(def) == 5); - assert(cuda::__argument::__highest_(def) == 100); + int val = 42; + auto def = + cuda::args::deferred{cuda::std::span{&val, 1}, cuda::args::bounds(5, 100), cuda::args::bounds<1, 256>()}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 256); + assert(cuda::args::__access::__runtime_bounds(def).lower() == 5); + assert(cuda::args::__access::__runtime_bounds(def).upper() == 100); + assert(cuda::args::__lowest_(def) == 5); + assert(cuda::args::__highest_(def) == 100); + cuda::args::__access::__runtime_bounds(def) = cuda::args::bounds(5, 90); + assert(cuda::args::__highest_(def) == 90); } // Deferred sequence via fancy iterator { auto it = cuda::counting_iterator{10}; - auto def = cuda::__argument::__deferred_sequence{it, cuda::__argument::__bounds<0, 100>()}; - assert(cuda::__argument::__unwrap(def)[0] == 10); - assert(cuda::__argument::__unwrap(def)[2] == 12); - static_assert(cuda::__argument::__traits::lowest == 0); - static_assert(cuda::__argument::__traits::highest == 100); - static_assert(!cuda::__argument::__traits::is_single_value); + auto def = cuda::args::deferred_sequence{it, cuda::args::bounds<0, 100>()}; + assert(cuda::args::__unwrap(def)[0] == 10); + assert(cuda::args::__unwrap(def)[2] == 12); + static_assert(cuda::args::__traits::lowest == 0); + static_assert(cuda::args::__traits::highest == 100); + static_assert(!cuda::args::__traits::is_single_value); } // Deferred sequence with both bounds { int arr[4] = {10, 20, 30, 40}; - auto def = cuda::__argument::__deferred_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds<1, 4096>(), cuda::__argument::__bounds(5, 100)}; - static_assert(cuda::__argument::__traits::lowest == 1); - assert(cuda::__argument::__lowest_(def) == 5); - assert(cuda::__argument::__highest_(def) == 100); + auto def = cuda::args::deferred_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds<1, 4096>(), cuda::args::bounds(5, 100)}; + assert(cuda::args::__access::__arg(def).size() == 4); + assert(cuda::args::__access::__runtime_bounds(def).lower() == 5); + assert(cuda::args::__access::__runtime_bounds(def).upper() == 100); + static_assert(cuda::args::__traits::lowest == 1); + assert(cuda::args::__lowest_(def) == 5); + assert(cuda::args::__highest_(def) == 100); } // Deferred sequence with both bounds, runtime bounds first { int arr[4] = {10, 20, 30, 40}; - auto def = cuda::__argument::__deferred_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds(5, 100), cuda::__argument::__bounds<1, 4096>()}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 4096); - assert(cuda::__argument::__lowest_(def) == 5); - assert(cuda::__argument::__highest_(def) == 100); + auto def = cuda::args::deferred_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds(5, 100), cuda::args::bounds<1, 4096>()}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 4096); + assert(cuda::args::__lowest_(def) == 5); + assert(cuda::args::__highest_(def) == 100); } // Traits: deferred is single value { - using traits = cuda::__argument::__traits>>; + using traits = cuda::args::__traits>>; static_assert(traits::is_deferred); static_assert(traits::is_single_value); } // Traits: deferred with pointer is also single value { - using traits = cuda::__argument::__traits>; + using traits = cuda::args::__traits>; static_assert(traits::is_deferred); static_assert(traits::is_single_value); } // Traits: deferred_sequence is not single value { - using traits = cuda::__argument::__traits>>; + using traits = cuda::args::__traits>>; static_assert(traits::is_deferred); static_assert(!traits::is_single_value); } @@ -125,16 +133,16 @@ TEST_FUNC constexpr bool test() // Unwrap: deferred { int val = 99; - auto def = cuda::__argument::__deferred{cuda::std::span{&val, 1}}; - auto& v = cuda::__argument::__unwrap(def); + auto def = cuda::args::deferred{cuda::std::span{&val, 1}}; + auto& v = cuda::args::__unwrap(def); assert(v[0] == 99); } // Unwrap: deferred_sequence { int arr[3] = {10, 20, 30}; - auto def = cuda::__argument::__deferred_sequence{cuda::std::span{arr, 3}}; - const auto& v = cuda::__argument::__unwrap(def); + auto def = cuda::args::deferred_sequence{cuda::std::span{arr, 3}}; + const auto& v = cuda::args::__unwrap(def); assert(v.size() == 3); assert(v[1] == 20); } @@ -142,14 +150,14 @@ TEST_FUNC constexpr bool test() // Unwrap: rvalue deferred returns by value { int val = 99; - auto v = cuda::__argument::__unwrap(cuda::__argument::__deferred{cuda::std::span{&val, 1}}); + auto v = cuda::args::__unwrap(cuda::args::deferred{cuda::std::span{&val, 1}}); assert(v[0] == 99); } // Unwrap: rvalue deferred_sequence returns by value { int arr[3] = {10, 20, 30}; - auto v = cuda::__argument::__unwrap(cuda::__argument::__deferred_sequence{cuda::std::span{arr, 3}}); + auto v = cuda::args::__unwrap(cuda::args::deferred_sequence{cuda::std::span{arr, 3}}); assert(v.size() == 3); assert(v[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp new file mode 100644 index 00000000000..64bad620293 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +[[maybe_unused]] cuda::args::deferred_sequence invalid_arg{0}; + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp new file mode 100644 index 00000000000..111bc226ae5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using traits = cuda::args::__traits>; + +[[maybe_unused]] constexpr bool invalid_traits = traits::is_deferred; + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp index c9723304774..7970c50e2df 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -25,93 +25,99 @@ TEST_FUNC constexpr bool test() { // Uniform scalar via CTAD { - auto da = cuda::__argument::__immediate{5}; - assert(cuda::__argument::__unwrap(da) == 5); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); - assert(cuda::__argument::__lowest_(da) == 5); - assert(cuda::__argument::__highest_(da) == 5); + auto da = cuda::args::immediate{5}; + assert(cuda::args::__unwrap(da) == 5); + assert(cuda::args::__access::__arg(da) == 5); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + assert(cuda::args::__lowest_(da) == 5); + assert(cuda::args::__highest_(da) == 5); + cuda::args::__access::__arg(da) = 6; + assert(cuda::args::__unwrap(da) == 6); } // Uniform scalar with static bounds { - auto da = cuda::__argument::__immediate{5, cuda::__argument::__bounds<1, 8>()}; - assert(cuda::__argument::__unwrap(da) == 5); - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 8); - assert(cuda::__argument::__lowest_(da) == 5); - assert(cuda::__argument::__highest_(da) == 5); + auto da = cuda::args::immediate{5, cuda::args::bounds<1, 8>()}; + assert(cuda::args::__unwrap(da) == 5); + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 8); + assert(cuda::args::__lowest_(da) == 5); + assert(cuda::args::__highest_(da) == 5); } // Non-sequence values are accepted without scalar-only restrictions { - auto da = cuda::__argument::__immediate{non_sequence_value{7}}; - assert(cuda::__argument::__unwrap(da).payload == 7); + auto da = cuda::args::immediate{non_sequence_value{7}}; + assert(cuda::args::__unwrap(da).payload == 7); } // Pointer-like types can still represent a single value when explicitly wrapped that way { int value = 11; - auto da = cuda::__argument::__immediate{&value}; - static_assert(cuda::__argument::__traits::is_single_value); - assert(*cuda::__argument::__unwrap(da) == 11); + auto da = cuda::args::immediate{&value}; + static_assert(cuda::args::__traits::is_single_value); + assert(*cuda::args::__unwrap(da) == 11); } // Per-segment span with runtime bounds { int arr[4] = {10, 20, 30, 40}; - auto da = - cuda::__argument::__immediate_sequence{cuda::std::span{arr, 4}, cuda::__argument::__bounds(1L, 100L)}; - assert(cuda::__argument::__unwrap(da).size() == 4); - assert(cuda::__argument::__lowest_(da) == 1); - assert(cuda::__argument::__highest_(da) == 100); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 4}, cuda::args::bounds(1L, 100L)}; + assert(cuda::args::__unwrap(da).size() == 4); + assert(cuda::args::__access::__arg(da).size() == 4); + assert(cuda::args::__access::__runtime_bounds(da).lower() == 1); + assert(cuda::args::__access::__runtime_bounds(da).upper() == 100); + assert(cuda::args::__lowest_(da) == 1); + assert(cuda::args::__highest_(da) == 100); + cuda::args::__access::__runtime_bounds(da) = cuda::args::bounds(1, 90); + assert(cuda::args::__highest_(da) == 90); } // Per-segment span with both bounds { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::__argument::__immediate_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(10, 200)}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 256); - assert(cuda::__argument::__lowest_(da) == 10); - assert(cuda::__argument::__highest_(da) == 200); + auto da = cuda::args::__immediate_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds<1, 256>(), cuda::args::bounds(10, 200)}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 256); + assert(cuda::args::__lowest_(da) == 10); + assert(cuda::args::__highest_(da) == 200); } // Per-segment span with both bounds, runtime bounds first { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::__argument::__immediate_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds(10, 200), cuda::__argument::__bounds<1, 256>()}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 256); - assert(cuda::__argument::__lowest_(da) == 10); - assert(cuda::__argument::__highest_(da) == 200); + auto da = cuda::args::__immediate_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds(10, 200), cuda::args::bounds<1, 256>()}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 256); + assert(cuda::args::__lowest_(da) == 10); + assert(cuda::args::__highest_(da) == 200); } // Per-segment via span { int arr[4] = {1, 2, 3, 4}; - auto da = cuda::__argument::__immediate_sequence{cuda::std::span{arr, 4}}; - assert(cuda::__argument::__unwrap(da).size() == 4); - assert(cuda::__argument::__unwrap(da)[0] == 1); - assert(cuda::__argument::__unwrap(da)[3] == 4); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 4}}; + assert(cuda::args::__unwrap(da).size() == 4); + assert(cuda::args::__unwrap(da)[0] == 1); + assert(cuda::args::__unwrap(da)[3] == 4); } // Per-segment with static bounds { int arr[4] = {10, 20, 30, 40}; - auto da = - cuda::__argument::__immediate_sequence{cuda::std::span{arr, 4}, cuda::__argument::__bounds<1, 100>()}; - assert(cuda::__argument::__unwrap(da).size() == 4); - assert(cuda::__argument::__unwrap(da)[2] == 30); - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 100); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 4}, cuda::args::bounds<1, 100>()}; + assert(cuda::args::__unwrap(da).size() == 4); + assert(cuda::args::__unwrap(da)[2] == 30); + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 100); } // Traits { - using traits = cuda::__argument::__traits>; + using traits = cuda::args::__traits>; static_assert(!traits::is_deferred); static_assert(traits::is_single_value); static_assert(cuda::std::is_same_v); @@ -119,48 +125,46 @@ TEST_FUNC constexpr bool test() // Sequence traits { - using traits = cuda::__argument::__traits>>; + using traits = cuda::args::__traits>>; static_assert(!traits::is_deferred); static_assert(!traits::is_single_value); static_assert(cuda::std::is_same_v>); } - // __is_single_value_v on unwrapped types + // __is_sequence_v on unwrapped types { - static_assert( - !cuda::__argument::__is_sequence_v>::value_type>); - static_assert( - !cuda::__argument::__traits>>::is_single_value); + static_assert(!cuda::args::__is_sequence_v>::value_type>); + static_assert(!cuda::args::__traits>>::is_single_value); } // Unwrap: scalar { - auto da = cuda::__argument::__immediate{7}; - auto& v = cuda::__argument::__unwrap(da); + auto da = cuda::args::immediate{7}; + auto& v = cuda::args::__unwrap(da); assert(v == 7); v = 8; - assert(cuda::__argument::__unwrap(da) == 8); + assert(cuda::args::__unwrap(da) == 8); } // Unwrap: span { int arr[3] = {10, 20, 30}; - auto da = cuda::__argument::__immediate_sequence{cuda::std::span{arr, 3}}; - const auto& v = cuda::__argument::__unwrap(da); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 3}}; + const auto& v = cuda::args::__unwrap(da); assert(v.size() == 3); assert(v[1] == 20); } // Unwrap: rvalue scalar returns by value { - const auto& v = cuda::__argument::__unwrap(cuda::__argument::__immediate{7}); + const auto& v = cuda::args::__unwrap(cuda::args::immediate{7}); assert(v == 7); } // Unwrap: rvalue span returns by value { int arr[3] = {10, 20, 30}; - auto v = cuda::__argument::__unwrap(cuda::__argument::__immediate_sequence{cuda::std::span{arr, 3}}); + auto v = cuda::args::__unwrap(cuda::args::__immediate_sequence{cuda::std::span{arr, 3}}); assert(v.size() == 3); assert(v[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp index 85ccaf1c8a0..f3cc7a2a993 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -24,72 +24,76 @@ TEST_FUNC void test() { // Basic value { - constexpr auto sa = cuda::__argument::__constant<42>{}; - static_assert(sa.value() == 42); + constexpr auto sa = cuda::args::constant<42>{}; + static_assert(cuda::args::__unwrap(sa) == 42); static_assert(cuda::std::is_same_v); } // Different types { - constexpr auto sa_long = cuda::__argument::__constant<100L>{}; - static_assert(sa_long.value() == 100L); + constexpr auto sa_long = cuda::args::constant<100L>{}; + static_assert(cuda::args::__unwrap(sa_long) == 100L); static_assert(cuda::std::is_same_v); + + constexpr auto sa_float = cuda::args::constant<10, float>{}; + static_assert(cuda::args::__unwrap(sa_float) == 10.0f); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); } // Negative value { - constexpr auto sa_neg = cuda::__argument::__constant<-1>{}; - static_assert(sa_neg.value() == -1); + constexpr auto sa_neg = cuda::args::constant<-1>{}; + static_assert(cuda::args::__unwrap(sa_neg) == -1); } #if TEST_HAS_CLASS_NTTP // Non-sequence values are accepted without scalar-only restrictions { - constexpr auto sa = cuda::__argument::__constant{}; - static_assert(sa.value().payload == 7); - static_assert(cuda::__argument::__unwrap(sa).payload == 7); + constexpr auto sa = cuda::args::constant{}; + static_assert(cuda::args::__unwrap(sa).payload == 7); } #endif // TEST_HAS_CLASS_NTTP #if TEST_HAS_CLASS_NTTP // Array sequence { - constexpr auto sa_arr = cuda::__argument::__constant_sequence{128, 256, 512}>{}; - static_assert(sa_arr.value()[0] == 128); - static_assert(sa_arr.value()[1] == 256); - static_assert(sa_arr.value()[2] == 512); + constexpr auto sa_arr = cuda::args::__constant_sequence{128, 256, 512}>{}; + static_assert(cuda::args::__unwrap(sa_arr)[0] == 128); + static_assert(cuda::args::__unwrap(sa_arr)[1] == 256); + static_assert(cuda::args::__unwrap(sa_arr)[2] == 512); static_assert(cuda::std::is_same_v>); } #endif // TEST_HAS_CLASS_NTTP // Bounds: scalar { - constexpr auto sa = cuda::__argument::__constant<42>{}; - static_assert(cuda::__argument::__lowest_(sa) == 42); - static_assert(cuda::__argument::__highest_(sa) == 42); + constexpr auto sa = cuda::args::constant<42>{}; + static_assert(cuda::args::__lowest_(sa) == 42); + static_assert(cuda::args::__highest_(sa) == 42); } #if TEST_HAS_CLASS_NTTP // Bounds: array sequence computes lowest/highest of elements { - constexpr auto sa = cuda::__argument::__constant_sequence{128, 256, 512}>{}; - static_assert(cuda::__argument::__lowest_(sa) == 128); - static_assert(cuda::__argument::__highest_(sa) == 512); + constexpr auto sa = cuda::args::__constant_sequence{128, 256, 512}>{}; + static_assert(cuda::args::__lowest_(sa) == 128); + static_assert(cuda::args::__highest_(sa) == 512); } #endif // TEST_HAS_CLASS_NTTP #if TEST_HAS_CLASS_NTTP // Bounds: empty array sequence has unconstrained element bounds { - constexpr auto sa = cuda::__argument::__constant_sequence{}>{}; - static_assert(cuda::__argument::__lowest_(sa) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__highest_(sa) == (cuda::std::numeric_limits::max)()); + constexpr auto sa = cuda::args::__constant_sequence{}>{}; + static_assert(cuda::args::__lowest_(sa) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__highest_(sa) == (cuda::std::numeric_limits::max)()); } #endif // TEST_HAS_CLASS_NTTP // Traits { - using traits = cuda::__argument::__traits>; + using traits = cuda::args::__traits>; static_assert(!traits::is_deferred); static_assert(traits::is_constant); static_assert(traits::is_single_value); @@ -98,10 +102,22 @@ TEST_FUNC void test() static_assert(traits::highest == 42); } + // Traits: explicit constant value type + { + using traits = cuda::args::__traits>; + static_assert(!traits::is_deferred); + static_assert(traits::is_constant); + static_assert(traits::is_single_value); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(traits::lowest == 10.0f); + static_assert(traits::highest == 10.0f); + } + #if TEST_HAS_CLASS_NTTP // Sequence traits { - using traits = cuda::__argument::__traits{1, 2, 3}>>; + using traits = cuda::args::__traits{1, 2, 3}>>; static_assert(traits::is_constant); static_assert(!traits::is_deferred); static_assert(!traits::is_single_value); @@ -112,26 +128,33 @@ TEST_FUNC void test() // Single value: scalar is single, sequence is not { - static_assert( - !cuda::__argument::__is_sequence_v>::value_type>); + static_assert(!cuda::args::__is_sequence_v>::value_type>); #if TEST_HAS_CLASS_NTTP - static_assert(!cuda::__argument::__traits< - cuda::__argument::__constant_sequence{1, 2, 3}>>::is_single_value); + static_assert( + !cuda::args::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP } // Unwrap: scalar { - constexpr auto sa = cuda::__argument::__constant<42>{}; - constexpr auto val = cuda::__argument::__unwrap(sa); + constexpr auto sa = cuda::args::constant<42>{}; + constexpr auto val = cuda::args::__unwrap(sa); static_assert(val == 42); } + // Unwrap: scalar with explicit value type + { + constexpr auto sa = cuda::args::constant<10, float>{}; + constexpr auto val = cuda::args::__unwrap(sa); + static_assert(val == 10.0f); + static_assert(cuda::std::is_same_v); + } + #if TEST_HAS_CLASS_NTTP // Unwrap: sequence { - constexpr auto sa = cuda::__argument::__constant_sequence{10, 20, 30}>{}; - constexpr auto val = cuda::__argument::__unwrap(sa); + constexpr auto sa = cuda::args::__constant_sequence{10, 20, 30}>{}; + constexpr auto val = cuda::args::__unwrap(sa); static_assert(val[0] == 10); static_assert(val[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp new file mode 100644 index 00000000000..b59d41fd7a1 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using arg_t = cuda::args::immediate>; + +[[maybe_unused]] arg_t invalid_arg{0}; + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp index 19e475ef453..5212c8a1f9c 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp @@ -8,11 +8,11 @@ // //===----------------------------------------------------------------------===// -#include +#include -using arg_t = cuda::__argument::__immediate>; +using arg_t = cuda::args::immediate>; -[[maybe_unused]] constexpr auto invalid_highest = cuda::__argument::__traits::highest; +[[maybe_unused]] constexpr auto invalid_highest = cuda::args::__traits::highest; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp index 8d240e0cf3e..02ba7ecfe96 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp @@ -8,9 +8,9 @@ // //===----------------------------------------------------------------------===// -#include +#include -[[maybe_unused]] constexpr auto invalid_bounds = cuda::__argument::__static_bounds<0, 1L>{}; +[[maybe_unused]] constexpr auto invalid_bounds = cuda::args::static_bounds<0, 1L>{}; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp index 8cc239585ac..eada29e23de 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp @@ -10,10 +10,10 @@ // Integration test: demonstrates how an algorithm consumes argument wrappers // to make compile-time and runtime resource decisions. -// All argument types (plain values, static, dynamic, deferred) work uniformly +// All argument types (plain values, constants, immediate values, deferred values) work uniformly // through the free functions. -#include +#include #include #include #include @@ -34,7 +34,7 @@ enum class algorithm_variant template TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg) { - if constexpr (cuda::__argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity) + if constexpr (cuda::args::__traits<_SegSizeArg>::highest <= shared_memory_capacity) { return algorithm_variant::shared_memory; } @@ -48,7 +48,7 @@ TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg) template TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_segments) { - auto __highest = cuda::std::min(default_max_segment_size, static_cast(cuda::__argument::__highest_(__seg_size))); + auto __highest = cuda::std::min(default_max_segment_size, static_cast(cuda::args::__highest_(__seg_size))); return __highest * __num_segments; } @@ -56,9 +56,9 @@ TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_se template TEST_FUNC constexpr int process_segments(_SegSizeArg __seg_size) { - const auto& __val = cuda::__argument::__unwrap(__seg_size); + const auto& __val = cuda::args::__unwrap(__seg_size); - if constexpr (cuda::__argument::__traits<_SegSizeArg>::is_single_value) + if constexpr (cuda::args::__traits<_SegSizeArg>::is_single_value) { return static_cast(__val); } @@ -93,83 +93,82 @@ TEST_FUNC constexpr bool test() } #endif - // static_argument: scalar, fits in shared memory, buffer = value + // constant: scalar, fits in shared memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__constant<128>{}; + constexpr auto seg_size = cuda::args::constant<128>{}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 128 * 4); assert(process_segments(seg_size) == 128); } #if TEST_HAS_CLASS_NTTP - // static_argument: array sequence, highest fits in shared memory + // __constant_sequence: array sequence, highest fits in shared memory { - constexpr auto seg_sizes = cuda::__argument::__constant_sequence{}; + constexpr auto seg_sizes = cuda::args::__constant_sequence{}; static_assert(select_variant(seg_sizes) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_sizes, 3) == 256 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 256); } - // static_argument: array sequence, highest exceeds shared memory, buffer clamped + // __constant_sequence: array sequence, highest exceeds shared memory, buffer clamped { - constexpr auto seg_sizes = cuda::__argument::__constant_sequence{}; + constexpr auto seg_sizes = cuda::args::__constant_sequence{}; static_assert(select_variant(seg_sizes) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_sizes, 3) == 512 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 512); } #endif // TEST_HAS_CLASS_NTTP - // dynamic_argument: tight static bounds, shared memory, buffer = value + // immediate: tight static bounds, shared memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__immediate{100, cuda::__argument::__bounds<1, 256>()}; + constexpr auto seg_size = cuda::args::immediate{100, cuda::args::bounds<1, 256>()}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); } - // dynamic_argument: wide static bounds, global memory, buffer = value + // immediate: wide static bounds, global memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__immediate{100, cuda::__argument::__bounds<1, 4096>()}; + constexpr auto seg_size = cuda::args::immediate{100, cuda::args::bounds<1, 4096>()}; static_assert(select_variant(seg_size) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); } - // dynamic_argument: no bounds, global memory, buffer = value + // immediate: no bounds, global memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__immediate{100}; + constexpr auto seg_size = cuda::args::immediate{100}; static_assert(select_variant(seg_size) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); } - // dynamic_argument: per-segment span with runtime bounds only + // __immediate_sequence: per-segment span with runtime bounds only { - int sizes[3] = {64, 128, 96}; - auto seg_sizes = - cuda::__argument::__immediate_sequence{cuda::std::span{sizes, 3}, cuda::__argument::__bounds(1, 200)}; + int sizes[3] = {64, 128, 96}; + auto seg_sizes = cuda::args::__immediate_sequence{cuda::std::span{sizes, 3}, cuda::args::bounds(1, 200)}; assert(select_variant(seg_sizes) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_sizes, 3) == 200 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 96); } - // dynamic_argument: per-segment span with both bounds + // __immediate_sequence: per-segment span with both bounds { int sizes[3] = {64, 128, 96}; - auto seg_sizes = cuda::__argument::__immediate_sequence{ - cuda::std::span{sizes, 3}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(1, 200)}; - static_assert(cuda::__argument::__traits::highest <= shared_memory_capacity); + auto seg_sizes = cuda::args::__immediate_sequence{ + cuda::std::span{sizes, 3}, cuda::args::bounds<1, 256>(), cuda::args::bounds(1, 200)}; + static_assert(cuda::args::__traits::highest <= shared_memory_capacity); assert(select_variant(seg_sizes) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_sizes, 3) == 200 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 96); } - // deferred_argument: uniform, bounds for decisions only + // deferred: uniform, bounds for decisions only { - int val = 100; - auto seg_size = cuda::__argument::__deferred{ - cuda::std::span{&val, 1}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(1, 200)}; - static_assert(cuda::__argument::__traits::highest <= shared_memory_capacity); + int val = 100; + auto seg_size = + cuda::args::deferred{cuda::std::span{&val, 1}, cuda::args::bounds<1, 256>(), cuda::args::bounds(1, 200)}; + static_assert(cuda::args::__traits::highest <= shared_memory_capacity); assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 200 * 4); } @@ -182,17 +181,24 @@ TEST_FUNC constexpr bool test() assert(process_segments(1.0f) == 1); } + // constant float using an integer NTTP and explicit value type + { + constexpr auto seg_size = cuda::args::constant<128, float>{}; + static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); + assert(process_segments(seg_size) == 128); + } + #if TEST_HAS_CLASS_NTTP - // static_argument float (float NTTPs require C++20) + // constant float (float NTTPs require C++20) { - constexpr auto seg_size = cuda::__argument::__constant<128.0f>{}; + constexpr auto seg_size = cuda::args::constant<128.0f>{}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(process_segments(seg_size) == 128); } - // dynamic_argument float with static bounds + // immediate float with static bounds { - constexpr auto seg_size = cuda::__argument::__immediate{100.0f, cuda::__argument::__bounds<1.0f, 256.0f>()}; + constexpr auto seg_size = cuda::args::immediate{100.0f, cuda::args::bounds<1.0f, 256.0f>()}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(process_segments(seg_size) == 100); }