From 06c277f7a6fe8cdc30ae0442fd570ec53a45437e Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Wed, 3 Jun 2026 22:57:33 -0700 Subject: [PATCH 1/5] Make argument wrappers construction public --- .../bench/segmented_topk/fixed/keys.cu | 12 +- .../bench/segmented_topk/variable/keys.cu | 14 +- cub/cub/agent/agent_batched_topk.cuh | 12 +- cub/cub/detail/segmented_params.cuh | 30 ++-- .../device/dispatch/dispatch_batched_topk.cuh | 20 +-- .../dispatch/kernels/kernel_batched_topk.cuh | 10 +- .../catch2_test_device_segmented_topk_keys.cu | 27 ++- ...catch2_test_device_segmented_topk_pairs.cu | 18 +- libcudacxx/include/cuda/__argument/argument.h | 160 ++++++++--------- .../include/cuda/__argument/argument_bounds.h | 4 +- libcudacxx/include/cuda/argument | 26 +++ .../include/cuda/std/__internal/namespaces.h | 4 +- .../cuda/argument/argument_bounds.pass.cpp | 42 +++-- .../cuda/argument/argument_traits.pass.cpp | 168 +++++++++--------- .../cuda/argument/deferred_argument.pass.cpp | 110 ++++++------ .../cuda/argument/dynamic_argument.pass.cpp | 119 +++++++------ .../cuda/argument/static_argument.pass.cpp | 50 +++--- .../static_bounds_conversion.fail.cpp | 6 +- .../static_bounds_type_mismatch.fail.cpp | 4 +- .../cuda/argument/usage_example.pass.cpp | 43 +++-- 20 files changed, 457 insertions(+), 422 deletions(-) create mode 100644 libcudacxx/include/cuda/argument diff --git a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu index b8f13469dce..4178e7ea0d5 100644 --- a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu @@ -4,7 +4,7 @@ #include #include -#include +#include #include #include @@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys( const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected); const auto num_segments = ::cuda::std::max(1, (max_elements / segment_size)); const auto elements = num_segments * segment_size; - const auto total_num_items = ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(elements)}; + const auto total_num_items = ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(elements)}; const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); // Skip workloads where k exceeds the segment size @@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys( auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size); auto d_keys_out = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements); - auto segment_sizes = ::cuda::__argument::__constant{}; - auto k = ::cuda::__argument::__constant{}; - auto select_direction = ::cuda::__argument::__constant{}; + auto segment_sizes = ::cuda::argument::constant{}; + auto k = ::cuda::argument::constant{}; + auto select_direction = ::cuda::argument::constant{}; state.add_element_count(elements, "NumElements"); state.add_element_count(segment_size, "SegmentSize"); @@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys( segment_sizes, k, select_direction, - ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(num_segments)}, + ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(num_segments)}, total_num_items, env); }); diff --git a/cub/benchmarks/bench/segmented_topk/variable/keys.cu b/cub/benchmarks/bench/segmented_topk/variable/keys.cu index 3db3da44976..d14394afc39 100644 --- a/cub/benchmarks/bench/segmented_topk/variable/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/variable/keys.cu @@ -8,7 +8,7 @@ #include #include -#include +#include #include #include #include @@ -172,17 +172,17 @@ void variable_seg_size_topk_keys(nvbench::state& state, static_cast(MaxSegmentSize)); const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end()); const auto output_elements = static_cast(num_segments) * K; - const auto total_num_items = ::cuda::__argument::__immediate{static_cast(input_elements)}; + const auto total_num_items = ::cuda::argument::immediate{static_cast(input_elements)}; auto in_keys_buffer = gen_data( num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data())); auto out_keys_buffer = thrust::device_vector(output_elements, thrust::no_init); - auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{ - thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()}; - auto k_param = ::cuda::__argument::__constant{}; - auto select_direction = ::cuda::__argument::__constant{}; - auto num_segments_param = ::cuda::__argument::__immediate{static_cast(num_segments)}; + auto segment_sizes_param = ::cuda::argument::immediate_sequence{ + thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::argument::bounds<1, MaxSegmentSize>()}; + auto k_param = ::cuda::argument::constant{}; + auto select_direction = ::cuda::argument::constant{}; + auto num_segments_param = ::cuda::argument::immediate{static_cast(num_segments)}; auto d_keys_in = cuda::make_strided_iterator( cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())), diff --git a/cub/cub/agent/agent_batched_topk.cuh b/cub/cub/agent/agent_batched_topk.cuh index c5ec20d26cd..bc10311d36d 100644 --- a/cub/cub/agent/agent_batched_topk.cuh +++ b/cub/cub/agent/agent_batched_topk.cuh @@ -23,8 +23,8 @@ #include #include -#include #include +#include CUB_NAMESPACE_BEGIN @@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment using key_t = it_value_t; using value_t = it_value_t; - using segment_size_val_t = typename ::cuda::__argument::__traits::element_type; - using num_segments_val_t = typename ::cuda::__argument::__traits::element_type; + using segment_size_val_t = typename ::cuda::argument::__traits::element_type; + using num_segments_val_t = typename ::cuda::argument::__traits::element_type; using counters_t = batched_topk_counters; static constexpr auto policy = PolicyGetter{}(); @@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread; // Check if there could be large segments present - static constexpr bool only_small_segments = ::cuda::__argument::__traits::highest <= tile_size; + static constexpr bool only_small_segments = ::cuda::argument::__traits::highest <= tile_size; // Check if we are dealing with keys-only or key-value pairs static constexpr bool is_keys_only = ::cuda::std::is_same_v; @@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment return; } - constexpr bool is_full_tile = ::cuda::__argument::__traits::is_constant - && ::cuda::__argument::__traits::lowest == tile_size; + constexpr bool is_full_tile = ::cuda::argument::__traits::is_constant + && ::cuda::argument::__traits::lowest == tile_size; // Resolve Segment Parameters const auto segment_size = params::get_param(segment_sizes, segment_id); diff --git a/cub/cub/detail/segmented_params.cuh b/cub/cub/detail/segmented_params.cuh index fe5cc5c9162..e9a999a87b1 100644 --- a/cub/cub/detail/segmented_params.cuh +++ b/cub/cub/detail/segmented_params.cuh @@ -13,7 +13,7 @@ # pragma system_header #endif // no system header -#include +#include #include #include #include @@ -33,10 +33,10 @@ namespace detail::params //! @param[in] __index Segment index to read for sequence arguments. //! @return The single argument value, or the sequence element at the given index. _CCCL_TEMPLATE(class _Tp, class _SegmentIndexT) -_CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) +_CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - if constexpr (::cuda::__argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value) + if constexpr (::cuda::argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value) { return __arg; } @@ -48,44 +48,44 @@ _CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t< template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +get_param(const ::cuda::argument::constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg); + return ::cuda::argument::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::argument::constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg)[__index]; + return ::cuda::argument::__unwrap(__arg)[__index]; } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param( - const ::cuda::__argument::__immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept + const ::cuda::argument::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg); + return ::cuda::argument::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::argument::immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg)[__index]; + return ::cuda::argument::__unwrap(__arg)[__index]; } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param( - const ::cuda::__argument::__deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept + const ::cuda::argument::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg); + return ::cuda::argument::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::__argument::__deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::argument::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::__argument::__unwrap(__arg)[__index]; + return ::cuda::argument::__unwrap(__arg)[__index]; } // ===================================================================== diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh index d0f2d4eed0a..68a3a4f26b0 100644 --- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh @@ -31,10 +31,10 @@ #include -#include #include #include #include +#include #include #include #include @@ -49,9 +49,9 @@ namespace detail::batched_topk // Internal: wrap user-facing select direction into discrete param for dispatch // ----------------------------------------------------------------------------- -// Uniform (compile-time): __constant -> single-option uniform_discrete_param. +// Uniform (compile-time): constant -> single-option uniform_discrete_param. template -[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::__argument::__constant) +[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::argument::constant) { return params::uniform_discrete_param{Dir}; } @@ -126,7 +126,7 @@ template >, it_value_t>, ::cuda::std::int64_t, - ::cuda::__argument::__traits::highest>> + ::cuda::argument::__traits::highest>> #if _CCCL_HAS_CONCEPTS() requires batched_topk_policy_selector #endif // _CCCL_HAS_CONCEPTS() @@ -145,7 +145,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream = nullptr, [[maybe_unused]] PolicySelector policy_selector = {}) { - using large_segment_tile_offset_t = typename ::cuda::__argument::__traits::element_type; + using large_segment_tile_offset_t = typename ::cuda::argument::__traits::element_type; // Wrap the raw enum into the internal discrete param type auto select_directions = wrap_select_direction(select_direction); @@ -171,9 +171,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( static constexpr int worker_per_segment_tile_size = worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread; static constexpr bool any_small_segments = - ::cuda::__argument::__traits::lowest <= worker_per_segment_tile_size; + ::cuda::argument::__traits::lowest <= worker_per_segment_tile_size; static constexpr bool only_small_segments = - ::cuda::__argument::__traits::highest <= worker_per_segment_tile_size; + ::cuda::argument::__traits::highest <= worker_per_segment_tile_size; // Allocation layout: // only_small_segments: [0] dummy. @@ -183,7 +183,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( static constexpr int allocations_array_size = only_small_segments ? 1 : (any_small_segments ? 3 : 2); size_t allocation_sizes[allocations_array_size] = {1}; - using num_segments_val_t = typename ::cuda::__argument::__traits::element_type; + using num_segments_val_t = typename ::cuda::argument::__traits::element_type; using counters_t = batched_topk_counters; using segment_size_scan_offset_t = detail::choose_offset_t; using segment_size_scan_input_op_t = @@ -239,7 +239,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( // TODO (elstehle): support number of segments provided by device-accessible iterator // Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host) - static_assert(::cuda::__argument::__traits::is_single_value, + static_assert(::cuda::argument::__traits::is_single_value, "Only uniform segment sizes are currently supported."); if constexpr (any_small_segments) @@ -341,7 +341,7 @@ template >, it_value_t>, ::cuda::std::int64_t, - ::cuda::__argument::__traits::highest>; + ::cuda::argument::__traits::highest>; return detail::dispatch_with_env_and_tuning( env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) { return dispatch( diff --git a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh index 1ff50dfaf67..9c904f95971 100644 --- a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh @@ -20,8 +20,8 @@ #include #include -#include #include +#include CUB_NAMESPACE_BEGIN @@ -39,7 +39,7 @@ private: worker_policy worker_per_segment_policy; multi_worker_policy multi_worker_per_segment_policy; }; - static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::__argument::__traits::highest; + static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::argument::__traits::highest; static constexpr batched_topk_policy active_policy = current_policy(); template @@ -133,8 +133,8 @@ __launch_bounds__(int( KParameterT k, SelectDirectionParameterT select_directions, NumSegmentsParameterT num_segments, - batched_topk_counters::element_type>* d_counters, - typename ::cuda::__argument::__traits::element_type* d_large_segments_ids, + batched_topk_counters::element_type>* d_counters, + typename ::cuda::argument::__traits::element_type* d_large_segments_ids, LargeSegmentTileOffsetT* d_large_segments_tile_offsets) { using agent_t = typename find_smallest_covering_policy< @@ -151,7 +151,7 @@ __launch_bounds__(int( LargeSegmentTileOffsetT>::agent_t; // Static Assertions (Constraints) - static_assert(agent_t::tile_size >= ::cuda::__argument::__traits::highest, + static_assert(agent_t::tile_size >= ::cuda::argument::__traits::highest, "Block size exceeds maximum segment size supported by SegmentSizeParameterT"); static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block, "Static shared memory per block must not exceed 48KB limit."); diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu index 3d00c1119cc..e16d5637f19 100644 --- a/cub/test/catch2_test_device_segmented_topk_keys.cu +++ b/cub/test/catch2_test_device_segmented_topk_keys.cu @@ -151,11 +151,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments", batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, + ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, + ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, direction, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_segments * segment_size}); + ::cuda::argument::immediate{num_segments}, + ::cuda::argument::immediate{num_segments * segment_size}); // Prepare expected results fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction); compact_sorted_keys_to_topk(expected_keys, segment_size, k); @@ -248,12 +248,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::__argument::__immediate_sequence{ - segment_size_it, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, + ::cuda::argument::immediate_sequence{ + segment_size_it, ::cuda::argument::bounds()}, + ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, direction, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_items}); + ::cuda::argument::immediate{num_segments}, + ::cuda::argument::immediate{num_items}); // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction); @@ -286,12 +286,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment batched_topk_keys( d_keys_in_it, d_keys_out_it, - ::cuda::__argument::__immediate{ - segment_size, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, + ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, + ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, cub::detail::topk::select::min, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_segments * segment_size}); + ::cuda::argument::immediate{num_segments}, + ::cuda::argument::immediate{num_segments * segment_size}); const int num_minus_zero = static_cast(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{})); REQUIRE(num_minus_zero >= 1); diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu index cc34ceba3c6..eb2825f15d2 100644 --- a/cub/test/catch2_test_device_segmented_topk_pairs.cu +++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu @@ -220,11 +220,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments" d_keys_out, d_values_in, d_values_out, - ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, + ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, + ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, direction, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_segments * segment_size}); + ::cuda::argument::immediate{num_segments}, + ::cuda::argument::immediate{num_segments * segment_size}); // Verification: // - We verify correct top-k selection through the keys @@ -340,12 +340,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen d_keys_out, d_values_in, d_values_out, - ::cuda::__argument::__immediate_sequence{ - segment_size_it, ::cuda::__argument::__bounds()}, - ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds()}, + ::cuda::argument::immediate_sequence{ + segment_size_it, ::cuda::argument::bounds()}, + ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, direction, - ::cuda::__argument::__immediate{num_segments}, - ::cuda::__argument::__immediate{num_items}); + ::cuda::argument::immediate{num_segments}, + ::cuda::argument::immediate{num_items}); // Verification: // - We verify correct top-k selection through the keys diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index fb514d22648..42d66233c6e 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -94,7 +94,7 @@ inline constexpr bool __is_iterable_v<_Tp, decltype(::cuda::std::declval().end())>> = true; // ===================================================================== -// __constant +// constant // ===================================================================== // Non-sequence wrappers intentionally do not reject types with a distinct element type. @@ -103,7 +103,7 @@ inline constexpr bool __is_iterable_v<_Tp, //! @brief Wraps a compile-time constant argument value. template -struct __constant +struct constant { using value_type = ::cuda::std::remove_cvref_t; using __element_type = value_type; @@ -116,7 +116,7 @@ struct __constant //! @brief Wraps a compile-time constant argument sequence. template -struct __constant_sequence +struct constant_sequence { using value_type = ::cuda::std::remove_cvref_t; using __element_type = __element_type_of_t; @@ -254,14 +254,14 @@ _CCCL_API constexpr void __validate_runtime_element_bounds( } // ===================================================================== -// __immediate +// immediate // ===================================================================== //! @brief Wraps a runtime argument value with optional bounds. //! //! The value is host-accessible at API call time. template -struct __immediate +struct immediate { using __element_type = __element_type_of_t<_Arg>; @@ -281,13 +281,13 @@ struct __immediate } public: - _CCCL_API constexpr __immediate(_Arg __arg) noexcept + _CCCL_API constexpr immediate(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_value(); } - _CCCL_API constexpr __immediate(_Arg __arg, _StaticBounds) noexcept + _CCCL_API constexpr immediate(_Arg __arg, _StaticBounds) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_value(); @@ -296,18 +296,18 @@ struct __immediate #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __immediate(_Arg, __static_bounds<_Lowest, _Highest>) - -> __immediate<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>) + -> immediate<_Arg, __static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== -// __immediate_sequence +// immediate_sequence // ===================================================================== //! @brief Wraps a runtime argument sequence with optional bounds. template -struct __immediate_sequence +struct immediate_sequence { using __element_type = __element_type_of_t<_Arg>; @@ -342,14 +342,14 @@ struct __immediate_sequence } public: - _CCCL_API constexpr __immediate_sequence(_Arg __arg) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_bounds(); __validate_value(); } - _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_bounds(); @@ -357,7 +357,7 @@ struct __immediate_sequence } template - _CCCL_API constexpr __immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -367,7 +367,7 @@ struct __immediate_sequence } template - _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -377,27 +377,27 @@ struct __immediate_sequence } template - _CCCL_API constexpr __immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept - : __immediate_sequence(::cuda::std::move(__arg), __sb, __rb) + _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept + : immediate_sequence(::cuda::std::move(__arg), __sb, __rb) {} }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>) - -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>) + -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) + -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) + -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== -// __deferred_base / __deferred / __deferred_sequence +// __deferred_base / deferred / deferred_sequence // ===================================================================== //! @brief Common base for deferred argument wrappers. @@ -451,57 +451,57 @@ struct __deferred_base //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available //! by the time the argument is consumed in stream order. template -struct __deferred : __deferred_base<_Arg, _StaticBounds> +struct deferred : __deferred_base<_Arg, _StaticBounds> { using __deferred_base<_Arg, _StaticBounds>::__deferred_base; }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __deferred(_Arg) -> __deferred<_Arg>; +_CCCL_HOST_DEVICE deferred(_Arg) -> deferred<_Arg>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __static_bounds<_Lowest, _Highest>) - -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>) + -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __runtime_bounds<_Tp>) -> __deferred<_Arg>; +_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>) -> deferred<_Arg>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) + -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) + -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be //! available by the time the argument is consumed in stream order. template -struct __deferred_sequence : __deferred_base<_Arg, _StaticBounds> +struct deferred_sequence : __deferred_base<_Arg, _StaticBounds> { using __deferred_base<_Arg, _StaticBounds>::__deferred_base; }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg) -> __deferred_sequence<_Arg>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg) -> deferred_sequence<_Arg>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>) - -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>) + -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> __deferred_sequence<_Arg>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> deferred_sequence<_Arg>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) + -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) + -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== @@ -511,17 +511,17 @@ _CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_boun template inline constexpr bool __is_wrapper_v = false; template -inline constexpr bool __is_wrapper_v<__immediate<_Arg, _StaticBounds>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__constant<_Value>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__constant_sequence<_Value>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__immediate_sequence<_Arg, _StaticBounds>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__deferred<_Arg, _StaticBounds>> = true; +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v<__deferred_sequence<_Arg, _StaticBounds>> = true; +inline constexpr bool __is_wrapper_v> = true; _CCCL_TEMPLATE(class _Tp) _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) @@ -531,87 +531,87 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate<_Arg, _StaticBounds>&& __arg) noexcept { return ::cuda::std::move(__arg.__arg_); } template [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t -__unwrap(const __constant<_Value>&) noexcept +__unwrap(const constant<_Value>&) noexcept { return _Value; } template [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t -__unwrap(const __constant_sequence<_Value>&) noexcept +__unwrap(const constant_sequence<_Value>&) noexcept { return _Value; } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept { return ::cuda::std::move(__arg.__arg_); } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__deferred<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __deferred<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__deferred<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred<_Arg, _StaticBounds>&& __arg) noexcept { return ::cuda::std::move(__arg.__arg_); } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept { return ::cuda::std::move(__arg.__arg_); } @@ -677,7 +677,7 @@ struct __traits_impl }; template -struct __traits_impl<__constant<_Value>> +struct __traits_impl> { using value_type = ::cuda::std::remove_cvref_t; using element_type = value_type; @@ -689,7 +689,7 @@ struct __traits_impl<__constant<_Value>> }; template -struct __traits_impl<__immediate<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; @@ -704,7 +704,7 @@ struct __traits_impl<__immediate<_Arg, _StaticBounds>> }; template -struct __traits_impl<__constant_sequence<_Value>> +struct __traits_impl> { using value_type = ::cuda::std::remove_cvref_t; using element_type = __element_type_of_t; @@ -717,7 +717,7 @@ struct __traits_impl<__constant_sequence<_Value>> }; template -struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; @@ -733,7 +733,7 @@ struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>> }; template -struct __traits_impl<__deferred<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; @@ -748,7 +748,7 @@ struct __traits_impl<__deferred<_Arg, _StaticBounds>> }; template -struct __traits_impl<__deferred_sequence<_Arg, _StaticBounds>> +struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; @@ -779,25 +779,25 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) ) } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value>) noexcept { return __constant_compute_lowest<_Value>(); } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant_sequence<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant_sequence<_Value>) noexcept { return __constant_sequence_compute_lowest<_Value>(); } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate<_Arg, _StaticBounds> __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); @@ -805,7 +805,7 @@ template } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__deferred<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); @@ -813,7 +813,7 @@ template } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(__deferred_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); @@ -829,25 +829,25 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) ) } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value>) noexcept { return __constant_compute_highest<_Value>(); } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant_sequence<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(constant_sequence<_Value>) noexcept { return __constant_sequence_compute_highest<_Value>(); } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate<_Arg, _StaticBounds> __arg) noexcept { return __arg.__arg_; } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); @@ -855,7 +855,7 @@ template } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__deferred<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(deferred<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); @@ -863,7 +863,7 @@ template } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(__deferred_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept { using _ET = __element_type_of_t<_Arg>; __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h index 11c46c74417..5ccaafb4589 100644 --- a/libcudacxx/include/cuda/__argument/argument_bounds.h +++ b/libcudacxx/include/cuda/__argument/argument_bounds.h @@ -119,7 +119,7 @@ inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true; //! @tparam _Upper The static upper bound. //! @return A compile-time bounds object. template -[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> __bounds() noexcept +[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> bounds() noexcept { return {}; } @@ -130,7 +130,7 @@ template //! @param __upper The runtime upper bound. //! @return A runtime bounds object. template -[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> __bounds(_Tp __lower, _Tp __upper) noexcept +[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept { return {__lower, __upper}; } diff --git a/libcudacxx/include/cuda/argument b/libcudacxx/include/cuda/argument new file mode 100644 index 00000000000..bcb079e7e5e --- /dev/null +++ b/libcudacxx/include/cuda/argument @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#ifndef _CUDA_ARGUMENT_ +#define _CUDA_ARGUMENT_ + +#include + +#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC) +# pragma GCC system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG) +# pragma clang system_header +#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC) +# pragma system_header +#endif // no system header + +#include + +#endif // _CUDA_ARGUMENT_ diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h index 325830c17cf..f3cc191adfc 100644 --- a/libcudacxx/include/cuda/std/__internal/namespaces.h +++ b/libcudacxx/include/cuda/std/__internal/namespaces.h @@ -115,8 +115,8 @@ #define _CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION _CCCL_BEGIN_NAMESPACE(cuda::execution) #define _CCCL_END_NAMESPACE_CUDA_EXECUTION _CCCL_END_NAMESPACE(cuda::execution) -#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::__argument) -#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT _CCCL_END_NAMESPACE(cuda::__argument) +#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::argument) +#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT _CCCL_END_NAMESPACE(cuda::argument) // Namespace to avoid name collisions with CPOs on clang-16 (see // https://godbolt.org/z/9TadonrdM for example). MSVC's ancient parser also gets confused with diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp index c03b685e821..9a5717f32c3 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include @@ -20,27 +20,27 @@ TEST_FUNC constexpr bool test() // Basic static bounds { - constexpr auto b = cuda::__argument::__static_bounds<1, 4096>{}; + constexpr auto b = cuda::argument::__static_bounds<1, 4096>{}; static_assert(b.lower() == 1); static_assert(b.upper() == 4096); } // Exact static bounds { - constexpr auto b = cuda::__argument::__static_bounds<42, 42>{}; + constexpr auto b = cuda::argument::__static_bounds<42, 42>{}; static_assert(b.lower() == 42); static_assert(b.upper() == 42); } // Long type deduced from NTTPs { - static_assert(cuda::std::is_same_v::lower()), long>); + static_assert(cuda::std::is_same_v::lower()), long>); } #if TEST_HAS_CLASS_NTTP // Static bounds preserve their original NTTP types { - constexpr auto b = cuda::__argument::__bounds<1.0f, 8.0f>(); + constexpr auto b = cuda::argument::bounds<1.0f, 8.0f>(); static_assert(b.lower() == 1.0f); static_assert(b.upper() == 8); static_assert(cuda::std::is_same_v); @@ -52,9 +52,13 @@ TEST_FUNC constexpr bool test() // Basic runtime bounds { - auto b = cuda::__argument::__runtime_bounds{10, 100}; + auto b = cuda::argument::__runtime_bounds{10, 100}; assert(b.lower() == 10); assert(b.upper() == 100); + assert(b.__lower_ == 10); + assert(b.__upper_ == 100); + b.__upper_ = 90; + assert(b.upper() == 90); static_assert(cuda::std::is_same_v); } @@ -62,35 +66,35 @@ TEST_FUNC constexpr bool test() // Static via factory { - constexpr auto b = cuda::__argument::__bounds<1, 8>(); + constexpr auto b = cuda::argument::bounds<1, 8>(); static_assert(b.lower() == 1); static_assert(b.upper() == 8); - static_assert(cuda::__argument::__is_static_bounds_cv_v); - static_assert(!cuda::__argument::__is_runtime_bounds_cv_v); - static_assert(cuda::__argument::__is_bounds_v); + static_assert(cuda::argument::__is_static_bounds_cv_v); + static_assert(!cuda::argument::__is_runtime_bounds_cv_v); + static_assert(cuda::argument::__is_bounds_v); } // Runtime via factory { - auto b = cuda::__argument::__bounds(10, 100); + auto b = cuda::argument::bounds(10, 100); assert(b.lower() == 10); assert(b.upper() == 100); - static_assert(!cuda::__argument::__is_static_bounds_cv_v); - static_assert(cuda::__argument::__is_runtime_bounds_cv_v); - static_assert(cuda::__argument::__is_bounds_v); + static_assert(!cuda::argument::__is_static_bounds_cv_v); + static_assert(cuda::argument::__is_runtime_bounds_cv_v); + static_assert(cuda::argument::__is_bounds_v); } // Static and runtime bounds intersection { - static_assert(cuda::__argument::__has_bounds_intersection>( - cuda::__argument::__runtime_bounds{50, 200})); - static_assert(!cuda::__argument::__has_bounds_intersection>( - cuda::__argument::__runtime_bounds{0, 50})); + static_assert(cuda::argument::__has_bounds_intersection>( + cuda::argument::__runtime_bounds{50, 200})); + static_assert(!cuda::argument::__has_bounds_intersection>( + cuda::argument::__runtime_bounds{0, 50})); } // Non-bounds type { - static_assert(!cuda::__argument::__is_bounds_v); + static_assert(!cuda::argument::__is_bounds_v); } return true; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp index cdcc6665747..50a6b51c932 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -50,134 +50,128 @@ TEST_FUNC void test() { // --- __is_sequence_v / __is_single_value_v --- - static_assert(!cuda::__argument::__is_sequence_v); - static_assert(!cuda::__argument::__is_sequence_v); - static_assert(!cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(cuda::__argument::__is_sequence_v&>); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(cuda::__argument::__is_sequence_v>); - static_assert(cuda::__argument::__is_sequence_v>); - - static_assert(cuda::__argument::__is_single_value_v); - static_assert(cuda::__argument::__is_single_value_v); - static_assert(cuda::__argument::__is_single_value_v); - static_assert(cuda::__argument::__is_single_value_v); - static_assert(cuda::__argument::__is_single_value_v); - static_assert(cuda::__argument::__is_single_value_v); - static_assert(!cuda::__argument::__is_single_value_v); - static_assert(!cuda::__argument::__is_single_value_v>); - static_assert(!cuda::__argument::__is_single_value_v>); - static_assert(!cuda::__argument::__is_single_value_v&>); - static_assert(!cuda::__argument::__is_single_value_v>); - static_assert(!cuda::__argument::__is_single_value_v>); - static_assert(!cuda::__argument::__is_single_value_v>); + static_assert(!cuda::argument::__is_sequence_v); + static_assert(!cuda::argument::__is_sequence_v); + static_assert(!cuda::argument::__is_sequence_v); + static_assert(cuda::argument::__is_sequence_v); + static_assert(cuda::argument::__is_sequence_v>); + static_assert(cuda::argument::__is_sequence_v>); + static_assert(cuda::argument::__is_sequence_v&>); + static_assert(cuda::argument::__is_sequence_v>); + static_assert(cuda::argument::__is_sequence_v>); + static_assert(cuda::argument::__is_sequence_v>); + static_assert(cuda::argument::__is_sequence_v>); + static_assert(cuda::argument::__is_sequence_v>); + + static_assert(cuda::argument::__is_single_value_v); + static_assert(cuda::argument::__is_single_value_v); + static_assert(cuda::argument::__is_single_value_v); + static_assert(cuda::argument::__is_single_value_v); + static_assert(cuda::argument::__is_single_value_v); + static_assert(cuda::argument::__is_single_value_v); + static_assert(!cuda::argument::__is_single_value_v); + static_assert(!cuda::argument::__is_single_value_v>); + static_assert(!cuda::argument::__is_single_value_v>); + static_assert(!cuda::argument::__is_single_value_v&>); + static_assert(!cuda::argument::__is_single_value_v>); + static_assert(!cuda::argument::__is_single_value_v>); + static_assert(!cuda::argument::__is_single_value_v>); // --- __element_type_of_t --- - static_assert(cuda::std::is_same_v&>, int>); - static_assert(cuda::std::is_same_v, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v&>, int>); + static_assert(cuda::std::is_same_v, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); static_assert( - cuda::std::is_same_v>>, + cuda::std::is_same_v>>, int>); - static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); // --- argument_traits: is_deferred --- - static_assert(!cuda::__argument::__traits::is_deferred); - static_assert(!cuda::__argument::__traits>::is_deferred); - static_assert(!cuda::__argument::__traits>>::is_deferred); - static_assert(!cuda::__argument::__traits>::is_deferred); + static_assert(!cuda::argument::__traits::is_deferred); + static_assert(!cuda::argument::__traits>::is_deferred); + static_assert(!cuda::argument::__traits>>::is_deferred); + static_assert(!cuda::argument::__traits>::is_deferred); #if TEST_HAS_CLASS_NTTP static_assert( - !cuda::__argument::__traits{1, 2, 3}>>::is_deferred); + !cuda::argument::__traits{1, 2, 3}>>::is_deferred); #endif // TEST_HAS_CLASS_NTTP - static_assert(cuda::__argument::__traits>>::is_deferred); - static_assert(cuda::__argument::__traits>>::is_deferred); + static_assert(cuda::argument::__traits>>::is_deferred); + static_assert(cuda::argument::__traits>>::is_deferred); // --- argument_traits: is_single_value --- - static_assert(cuda::__argument::__traits::is_single_value); - static_assert(!cuda::__argument::__traits::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert( - cuda::__argument::__traits>>::is_single_value); - static_assert( - !cuda::__argument::__traits>>::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); + static_assert(cuda::argument::__traits::is_single_value); + static_assert(!cuda::argument::__traits::is_single_value); + static_assert(cuda::argument::__traits>::is_single_value); + static_assert(cuda::argument::__traits>::is_single_value); + static_assert(cuda::argument::__traits>>::is_single_value); + static_assert(!cuda::argument::__traits>>::is_single_value); + static_assert(cuda::argument::__traits>::is_single_value); #if TEST_HAS_CLASS_NTTP - static_assert(!cuda::__argument::__traits< - cuda::__argument::__constant_sequence{1, 2, 3}>>::is_single_value); -#endif // TEST_HAS_CLASS_NTTP - static_assert(cuda::__argument::__traits>::is_single_value); static_assert( - !cuda::__argument::__traits>>::is_single_value); + !cuda::argument::__traits{1, 2, 3}>>::is_single_value); +#endif // TEST_HAS_CLASS_NTTP + static_assert(cuda::argument::__traits>::is_single_value); + static_assert(!cuda::argument::__traits>>::is_single_value); // --- argument_traits: value_type --- - static_assert(cuda::std::is_same_v::value_type, int>); - static_assert(cuda::std::is_same_v>::value_type, int>); - static_assert(cuda::std::is_same_v< - cuda::__argument::__traits>>::value_type, - cuda::std::span>); - static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert(cuda::std::is_same_v::value_type, int>); + static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert( + cuda::std::is_same_v>>::value_type, + cuda::std::span>); + static_assert(cuda::std::is_same_v>::value_type, int>); #if TEST_HAS_CLASS_NTTP static_assert( cuda::std::is_same_v< - cuda::__argument::__traits{1, 2, 3}>>::value_type, + cuda::argument::__traits{1, 2, 3}>>::value_type, cuda::std::array>); #endif // TEST_HAS_CLASS_NTTP // --- argument_traits: lowest / highest --- - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); static_assert( - cuda::__argument::__traits>>::lowest - == 1); + cuda::argument::__traits>>::lowest == 1); static_assert( - cuda::__argument::__traits>&>::highest - == 8); + cuda::argument::__traits>&>::highest == 8); static_assert( - cuda::__argument::__traits< - cuda::__argument::__immediate_sequence, cuda::__argument::__static_bounds<1, 8>>>::highest + cuda::argument::__traits< + cuda::argument::immediate_sequence, cuda::argument::__static_bounds<1, 8>>>::highest == 8); #if TEST_HAS_CLASS_NTTP static_assert( - cuda::__argument::__traits{3, 1, 2}>>::lowest == 1); + cuda::argument::__traits{3, 1, 2}>>::lowest == 1); static_assert( - cuda::__argument::__traits{3, 1, 2}>>::highest == 3); + cuda::argument::__traits{3, 1, 2}>>::highest == 3); #endif // TEST_HAS_CLASS_NTTP // --- Free function bounds on plain values --- - static_assert(cuda::__argument::__lowest_(42) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__highest_(42) == (cuda::std::numeric_limits::max)()); - static_assert(cuda::__argument::__lowest_(1.0f) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__highest_(1.0f) == (cuda::std::numeric_limits::max)()); + static_assert(cuda::argument::__lowest_(42) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__highest_(42) == (cuda::std::numeric_limits::max)()); + static_assert(cuda::argument::__lowest_(1.0f) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__highest_(1.0f) == (cuda::std::numeric_limits::max)()); // --- Scalar and sequence wrappers expose distinct single-value traits --- - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert(cuda::__argument::__traits>::is_single_value); - static_assert( - !cuda::__argument::__traits>>::is_single_value); + static_assert(cuda::argument::__traits>::is_single_value); + static_assert(cuda::argument::__traits>::is_single_value); + static_assert(!cuda::argument::__traits>>::is_single_value); #if TEST_HAS_CLASS_NTTP - static_assert(!cuda::__argument::__traits< - cuda::__argument::__constant_sequence{1, 2, 3}>>::is_single_value); + static_assert( + !cuda::argument::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp index 9949e0013b4..f9bda950aa7 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -22,102 +22,110 @@ TEST_FUNC constexpr bool test() // Deferred single value via span { int val = 42; - auto def = cuda::__argument::__deferred{cuda::std::span{&val, 1}}; - assert(cuda::__argument::__unwrap(def)[0] == 42); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); + auto def = cuda::argument::deferred{cuda::std::span{&val, 1}}; + assert(cuda::argument::__unwrap(def)[0] == 42); + assert(def.__arg_[0] == 42); + static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); } // Deferred single value with static bounds { int val = 42; - auto def = cuda::__argument::__deferred{cuda::std::span{&val, 1}, cuda::__argument::__bounds<1, 1000>()}; - assert(cuda::__argument::__unwrap(def)[0] == 42); - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 1000); + auto def = cuda::argument::deferred{cuda::std::span{&val, 1}, cuda::argument::bounds<1, 1000>()}; + assert(cuda::argument::__unwrap(def)[0] == 42); + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 1000); } // Deferred single value via pointer { int val = 42; - using def_t = cuda::__argument::__deferred>; - static_assert(cuda::__argument::__traits::lowest == 0); - static_assert(cuda::__argument::__traits::highest == 100); + using def_t = cuda::argument::deferred>; + static_assert(cuda::argument::__traits::lowest == 0); + static_assert(cuda::argument::__traits::highest == 100); // Also verify construction works - auto def = cuda::__argument::__deferred{&val, cuda::__argument::__bounds<0, 100>()}; - assert(cuda::__argument::__unwrap(def) == &val); + auto def = cuda::argument::deferred{&val, cuda::argument::bounds<0, 100>()}; + assert(cuda::argument::__unwrap(def) == &val); } // Deferred single value via fancy iterator { auto it = cuda::counting_iterator{42}; - auto def = cuda::__argument::__deferred{it, cuda::__argument::__bounds<0, 100>()}; - assert(cuda::__argument::__unwrap(def)[0] == 42); - static_assert(cuda::__argument::__traits::lowest == 0); - static_assert(cuda::__argument::__traits::highest == 100); - static_assert(cuda::__argument::__traits::is_single_value); + auto def = cuda::argument::deferred{it, cuda::argument::bounds<0, 100>()}; + assert(cuda::argument::__unwrap(def)[0] == 42); + static_assert(cuda::argument::__traits::lowest == 0); + static_assert(cuda::argument::__traits::highest == 100); + static_assert(cuda::argument::__traits::is_single_value); } // Deferred single value with both bounds, runtime bounds first { int val = 42; - auto def = cuda::__argument::__deferred{ - cuda::std::span{&val, 1}, cuda::__argument::__bounds(5, 100), cuda::__argument::__bounds<1, 256>()}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 256); - assert(cuda::__argument::__lowest_(def) == 5); - assert(cuda::__argument::__highest_(def) == 100); + auto def = cuda::argument::deferred{ + cuda::std::span{&val, 1}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 256>()}; + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 256); + assert(def.__runtime_bounds_.__lower_ == 5); + assert(def.__runtime_bounds_.__upper_ == 100); + assert(cuda::argument::__lowest_(def) == 5); + assert(cuda::argument::__highest_(def) == 100); + def.__runtime_bounds_.__upper_ = 90; + assert(cuda::argument::__highest_(def) == 90); } // Deferred sequence via fancy iterator { auto it = cuda::counting_iterator{10}; - auto def = cuda::__argument::__deferred_sequence{it, cuda::__argument::__bounds<0, 100>()}; - assert(cuda::__argument::__unwrap(def)[0] == 10); - assert(cuda::__argument::__unwrap(def)[2] == 12); - static_assert(cuda::__argument::__traits::lowest == 0); - static_assert(cuda::__argument::__traits::highest == 100); - static_assert(!cuda::__argument::__traits::is_single_value); + auto def = cuda::argument::deferred_sequence{it, cuda::argument::bounds<0, 100>()}; + assert(cuda::argument::__unwrap(def)[0] == 10); + assert(cuda::argument::__unwrap(def)[2] == 12); + static_assert(cuda::argument::__traits::lowest == 0); + static_assert(cuda::argument::__traits::highest == 100); + static_assert(!cuda::argument::__traits::is_single_value); } // Deferred sequence with both bounds { int arr[4] = {10, 20, 30, 40}; - auto def = cuda::__argument::__deferred_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds<1, 4096>(), cuda::__argument::__bounds(5, 100)}; - static_assert(cuda::__argument::__traits::lowest == 1); - assert(cuda::__argument::__lowest_(def) == 5); - assert(cuda::__argument::__highest_(def) == 100); + auto def = cuda::argument::deferred_sequence{ + cuda::std::span{arr, 4}, cuda::argument::bounds<1, 4096>(), cuda::argument::bounds(5, 100)}; + assert(def.__arg_.size() == 4); + assert(def.__runtime_bounds_.__lower_ == 5); + assert(def.__runtime_bounds_.__upper_ == 100); + static_assert(cuda::argument::__traits::lowest == 1); + assert(cuda::argument::__lowest_(def) == 5); + assert(cuda::argument::__highest_(def) == 100); } // Deferred sequence with both bounds, runtime bounds first { int arr[4] = {10, 20, 30, 40}; - auto def = cuda::__argument::__deferred_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds(5, 100), cuda::__argument::__bounds<1, 4096>()}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 4096); - assert(cuda::__argument::__lowest_(def) == 5); - assert(cuda::__argument::__highest_(def) == 100); + auto def = cuda::argument::deferred_sequence{ + cuda::std::span{arr, 4}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 4096>()}; + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 4096); + assert(cuda::argument::__lowest_(def) == 5); + assert(cuda::argument::__highest_(def) == 100); } // Traits: deferred is single value { - using traits = cuda::__argument::__traits>>; + using traits = cuda::argument::__traits>>; static_assert(traits::is_deferred); static_assert(traits::is_single_value); } // Traits: deferred with pointer is also single value { - using traits = cuda::__argument::__traits>; + using traits = cuda::argument::__traits>; static_assert(traits::is_deferred); static_assert(traits::is_single_value); } // Traits: deferred_sequence is not single value { - using traits = cuda::__argument::__traits>>; + using traits = cuda::argument::__traits>>; static_assert(traits::is_deferred); static_assert(!traits::is_single_value); } @@ -125,16 +133,16 @@ TEST_FUNC constexpr bool test() // Unwrap: deferred { int val = 99; - auto def = cuda::__argument::__deferred{cuda::std::span{&val, 1}}; - auto& v = cuda::__argument::__unwrap(def); + auto def = cuda::argument::deferred{cuda::std::span{&val, 1}}; + auto& v = cuda::argument::__unwrap(def); assert(v[0] == 99); } // Unwrap: deferred_sequence { int arr[3] = {10, 20, 30}; - auto def = cuda::__argument::__deferred_sequence{cuda::std::span{arr, 3}}; - const auto& v = cuda::__argument::__unwrap(def); + auto def = cuda::argument::deferred_sequence{cuda::std::span{arr, 3}}; + const auto& v = cuda::argument::__unwrap(def); assert(v.size() == 3); assert(v[1] == 20); } @@ -142,14 +150,14 @@ TEST_FUNC constexpr bool test() // Unwrap: rvalue deferred returns by value { int val = 99; - auto v = cuda::__argument::__unwrap(cuda::__argument::__deferred{cuda::std::span{&val, 1}}); + auto v = cuda::argument::__unwrap(cuda::argument::deferred{cuda::std::span{&val, 1}}); assert(v[0] == 99); } // Unwrap: rvalue deferred_sequence returns by value { int arr[3] = {10, 20, 30}; - auto v = cuda::__argument::__unwrap(cuda::__argument::__deferred_sequence{cuda::std::span{arr, 3}}); + auto v = cuda::argument::__unwrap(cuda::argument::deferred_sequence{cuda::std::span{arr, 3}}); assert(v.size() == 3); assert(v[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp index eac5e71aa82..23e867be616 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -25,93 +25,99 @@ TEST_FUNC constexpr bool test() { // Uniform scalar via CTAD { - auto da = cuda::__argument::__immediate{5}; - assert(cuda::__argument::__unwrap(da) == 5); - static_assert(cuda::__argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__traits::highest == (cuda::std::numeric_limits::max)()); - assert(cuda::__argument::__lowest_(da) == 5); - assert(cuda::__argument::__highest_(da) == 5); + auto da = cuda::argument::immediate{5}; + assert(cuda::argument::__unwrap(da) == 5); + assert(da.__arg_ == 5); + static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); + assert(cuda::argument::__lowest_(da) == 5); + assert(cuda::argument::__highest_(da) == 5); + da.__arg_ = 6; + assert(cuda::argument::__unwrap(da) == 6); } // Uniform scalar with static bounds { - auto da = cuda::__argument::__immediate{5, cuda::__argument::__bounds<1, 8>()}; - assert(cuda::__argument::__unwrap(da) == 5); - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 8); - assert(cuda::__argument::__lowest_(da) == 5); - assert(cuda::__argument::__highest_(da) == 5); + auto da = cuda::argument::immediate{5, cuda::argument::bounds<1, 8>()}; + assert(cuda::argument::__unwrap(da) == 5); + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 8); + assert(cuda::argument::__lowest_(da) == 5); + assert(cuda::argument::__highest_(da) == 5); } // Non-sequence values are accepted without scalar-only restrictions { - auto da = cuda::__argument::__immediate{non_sequence_value{7}}; - assert(cuda::__argument::__unwrap(da).payload == 7); + auto da = cuda::argument::immediate{non_sequence_value{7}}; + assert(cuda::argument::__unwrap(da).payload == 7); } // Pointer-like types can still represent a single value when explicitly wrapped that way { int value = 11; - auto da = cuda::__argument::__immediate{&value}; - static_assert(cuda::__argument::__traits::is_single_value); - assert(*cuda::__argument::__unwrap(da) == 11); + auto da = cuda::argument::immediate{&value}; + static_assert(cuda::argument::__traits::is_single_value); + assert(*cuda::argument::__unwrap(da) == 11); } // Per-segment span with runtime bounds { int arr[4] = {10, 20, 30, 40}; - auto da = - cuda::__argument::__immediate_sequence{cuda::std::span{arr, 4}, cuda::__argument::__bounds(1L, 100L)}; - assert(cuda::__argument::__unwrap(da).size() == 4); - assert(cuda::__argument::__lowest_(da) == 1); - assert(cuda::__argument::__highest_(da) == 100); + auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}, cuda::argument::bounds(1L, 100L)}; + assert(cuda::argument::__unwrap(da).size() == 4); + assert(da.__arg_.size() == 4); + assert(da.__runtime_bounds_.__lower_ == 1); + assert(da.__runtime_bounds_.__upper_ == 100); + assert(cuda::argument::__lowest_(da) == 1); + assert(cuda::argument::__highest_(da) == 100); + da.__runtime_bounds_.__upper_ = 90; + assert(cuda::argument::__highest_(da) == 90); } // Per-segment span with both bounds { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::__argument::__immediate_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(10, 200)}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 256); - assert(cuda::__argument::__lowest_(da) == 10); - assert(cuda::__argument::__highest_(da) == 200); + auto da = cuda::argument::immediate_sequence{ + cuda::std::span{arr, 4}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(10, 200)}; + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 256); + assert(cuda::argument::__lowest_(da) == 10); + assert(cuda::argument::__highest_(da) == 200); } // Per-segment span with both bounds, runtime bounds first { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::__argument::__immediate_sequence{ - cuda::std::span{arr, 4}, cuda::__argument::__bounds(10, 200), cuda::__argument::__bounds<1, 256>()}; - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 256); - assert(cuda::__argument::__lowest_(da) == 10); - assert(cuda::__argument::__highest_(da) == 200); + auto da = cuda::argument::immediate_sequence{ + cuda::std::span{arr, 4}, cuda::argument::bounds(10, 200), cuda::argument::bounds<1, 256>()}; + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 256); + assert(cuda::argument::__lowest_(da) == 10); + assert(cuda::argument::__highest_(da) == 200); } // Per-segment via span { int arr[4] = {1, 2, 3, 4}; - auto da = cuda::__argument::__immediate_sequence{cuda::std::span{arr, 4}}; - assert(cuda::__argument::__unwrap(da).size() == 4); - assert(cuda::__argument::__unwrap(da)[0] == 1); - assert(cuda::__argument::__unwrap(da)[3] == 4); + auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}}; + assert(cuda::argument::__unwrap(da).size() == 4); + assert(cuda::argument::__unwrap(da)[0] == 1); + assert(cuda::argument::__unwrap(da)[3] == 4); } // Per-segment with static bounds { int arr[4] = {10, 20, 30, 40}; - auto da = - cuda::__argument::__immediate_sequence{cuda::std::span{arr, 4}, cuda::__argument::__bounds<1, 100>()}; - assert(cuda::__argument::__unwrap(da).size() == 4); - assert(cuda::__argument::__unwrap(da)[2] == 30); - static_assert(cuda::__argument::__traits::lowest == 1); - static_assert(cuda::__argument::__traits::highest == 100); + auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}, cuda::argument::bounds<1, 100>()}; + assert(cuda::argument::__unwrap(da).size() == 4); + assert(cuda::argument::__unwrap(da)[2] == 30); + static_assert(cuda::argument::__traits::lowest == 1); + static_assert(cuda::argument::__traits::highest == 100); } // Traits { - using traits = cuda::__argument::__traits>; + using traits = cuda::argument::__traits>; static_assert(!traits::is_deferred); static_assert(traits::is_single_value); static_assert(cuda::std::is_same_v); @@ -119,7 +125,7 @@ TEST_FUNC constexpr bool test() // Sequence traits { - using traits = cuda::__argument::__traits>>; + using traits = cuda::argument::__traits>>; static_assert(!traits::is_deferred); static_assert(!traits::is_single_value); static_assert(cuda::std::is_same_v>); @@ -128,39 +134,38 @@ TEST_FUNC constexpr bool test() // __is_single_value_v on unwrapped types { static_assert( - cuda::__argument::__is_single_value_v>::value_type>); - static_assert( - !cuda::__argument::__traits>>::is_single_value); + cuda::argument::__is_single_value_v>::value_type>); + static_assert(!cuda::argument::__traits>>::is_single_value); } // Unwrap: scalar { - auto da = cuda::__argument::__immediate{7}; - auto& v = cuda::__argument::__unwrap(da); + auto da = cuda::argument::immediate{7}; + auto& v = cuda::argument::__unwrap(da); assert(v == 7); v = 8; - assert(cuda::__argument::__unwrap(da) == 8); + assert(cuda::argument::__unwrap(da) == 8); } // Unwrap: span { int arr[3] = {10, 20, 30}; - auto da = cuda::__argument::__immediate_sequence{cuda::std::span{arr, 3}}; - const auto& v = cuda::__argument::__unwrap(da); + auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 3}}; + const auto& v = cuda::argument::__unwrap(da); assert(v.size() == 3); assert(v[1] == 20); } // Unwrap: rvalue scalar returns by value { - const auto& v = cuda::__argument::__unwrap(cuda::__argument::__immediate{7}); + const auto& v = cuda::argument::__unwrap(cuda::argument::immediate{7}); assert(v == 7); } // Unwrap: rvalue span returns by value { int arr[3] = {10, 20, 30}; - auto v = cuda::__argument::__unwrap(cuda::__argument::__immediate_sequence{cuda::std::span{arr, 3}}); + auto v = cuda::argument::__unwrap(cuda::argument::immediate_sequence{cuda::std::span{arr, 3}}); assert(v.size() == 3); assert(v[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp index 7d40183070b..485a17fe72b 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp @@ -8,7 +8,7 @@ // //===----------------------------------------------------------------------===// -#include +#include #include #include #include @@ -24,37 +24,37 @@ TEST_FUNC void test() { // Basic value { - constexpr auto sa = cuda::__argument::__constant<42>{}; + constexpr auto sa = cuda::argument::constant<42>{}; static_assert(sa.value() == 42); static_assert(cuda::std::is_same_v); } // Different types { - constexpr auto sa_long = cuda::__argument::__constant<100L>{}; + constexpr auto sa_long = cuda::argument::constant<100L>{}; static_assert(sa_long.value() == 100L); static_assert(cuda::std::is_same_v); } // Negative value { - constexpr auto sa_neg = cuda::__argument::__constant<-1>{}; + constexpr auto sa_neg = cuda::argument::constant<-1>{}; static_assert(sa_neg.value() == -1); } #if TEST_HAS_CLASS_NTTP // Non-sequence values are accepted without scalar-only restrictions { - constexpr auto sa = cuda::__argument::__constant{}; + constexpr auto sa = cuda::argument::constant{}; static_assert(sa.value().payload == 7); - static_assert(cuda::__argument::__unwrap(sa).payload == 7); + static_assert(cuda::argument::__unwrap(sa).payload == 7); } #endif // TEST_HAS_CLASS_NTTP #if TEST_HAS_CLASS_NTTP // Array sequence { - constexpr auto sa_arr = cuda::__argument::__constant_sequence{128, 256, 512}>{}; + constexpr auto sa_arr = cuda::argument::constant_sequence{128, 256, 512}>{}; static_assert(sa_arr.value()[0] == 128); static_assert(sa_arr.value()[1] == 256); static_assert(sa_arr.value()[2] == 512); @@ -64,32 +64,32 @@ TEST_FUNC void test() // Bounds: scalar { - constexpr auto sa = cuda::__argument::__constant<42>{}; - static_assert(cuda::__argument::__lowest_(sa) == 42); - static_assert(cuda::__argument::__highest_(sa) == 42); + constexpr auto sa = cuda::argument::constant<42>{}; + static_assert(cuda::argument::__lowest_(sa) == 42); + static_assert(cuda::argument::__highest_(sa) == 42); } #if TEST_HAS_CLASS_NTTP // Bounds: array sequence computes lowest/highest of elements { - constexpr auto sa = cuda::__argument::__constant_sequence{128, 256, 512}>{}; - static_assert(cuda::__argument::__lowest_(sa) == 128); - static_assert(cuda::__argument::__highest_(sa) == 512); + constexpr auto sa = cuda::argument::constant_sequence{128, 256, 512}>{}; + static_assert(cuda::argument::__lowest_(sa) == 128); + static_assert(cuda::argument::__highest_(sa) == 512); } #endif // TEST_HAS_CLASS_NTTP #if TEST_HAS_CLASS_NTTP // Bounds: empty array sequence has unconstrained element bounds { - constexpr auto sa = cuda::__argument::__constant_sequence{}>{}; - static_assert(cuda::__argument::__lowest_(sa) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::__argument::__highest_(sa) == (cuda::std::numeric_limits::max)()); + constexpr auto sa = cuda::argument::constant_sequence{}>{}; + static_assert(cuda::argument::__lowest_(sa) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::argument::__highest_(sa) == (cuda::std::numeric_limits::max)()); } #endif // TEST_HAS_CLASS_NTTP // Traits { - using traits = cuda::__argument::__traits>; + using traits = cuda::argument::__traits>; static_assert(!traits::is_deferred); static_assert(traits::is_constant); static_assert(traits::is_single_value); @@ -101,7 +101,7 @@ TEST_FUNC void test() #if TEST_HAS_CLASS_NTTP // Sequence traits { - using traits = cuda::__argument::__traits{1, 2, 3}>>; + using traits = cuda::argument::__traits{1, 2, 3}>>; static_assert(traits::is_constant); static_assert(!traits::is_deferred); static_assert(!traits::is_single_value); @@ -113,25 +113,25 @@ TEST_FUNC void test() // Single value: scalar is single, sequence is not { static_assert( - cuda::__argument::__is_single_value_v>::value_type>); + cuda::argument::__is_single_value_v>::value_type>); #if TEST_HAS_CLASS_NTTP - static_assert(!cuda::__argument::__traits< - cuda::__argument::__constant_sequence{1, 2, 3}>>::is_single_value); + static_assert( + !cuda::argument::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP } // Unwrap: scalar { - constexpr auto sa = cuda::__argument::__constant<42>{}; - constexpr auto val = cuda::__argument::__unwrap(sa); + constexpr auto sa = cuda::argument::constant<42>{}; + constexpr auto val = cuda::argument::__unwrap(sa); static_assert(val == 42); } #if TEST_HAS_CLASS_NTTP // Unwrap: sequence { - constexpr auto sa = cuda::__argument::__constant_sequence{10, 20, 30}>{}; - constexpr auto val = cuda::__argument::__unwrap(sa); + constexpr auto sa = cuda::argument::constant_sequence{10, 20, 30}>{}; + constexpr auto val = cuda::argument::__unwrap(sa); static_assert(val[0] == 10); static_assert(val[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp index 19e475ef453..79a96a3366e 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp @@ -8,11 +8,11 @@ // //===----------------------------------------------------------------------===// -#include +#include -using arg_t = cuda::__argument::__immediate>; +using arg_t = cuda::argument::immediate>; -[[maybe_unused]] constexpr auto invalid_highest = cuda::__argument::__traits::highest; +[[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits::highest; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp index 8d240e0cf3e..d3a0499fc55 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp @@ -8,9 +8,9 @@ // //===----------------------------------------------------------------------===// -#include +#include -[[maybe_unused]] constexpr auto invalid_bounds = cuda::__argument::__static_bounds<0, 1L>{}; +[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::__static_bounds<0, 1L>{}; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp index e3a752b780a..f449975cb06 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp @@ -13,7 +13,7 @@ // All argument types (plain values, static, dynamic, deferred) work uniformly // through the free functions. -#include +#include #include #include #include @@ -34,7 +34,7 @@ enum class algorithm_variant template TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg) { - if constexpr (cuda::__argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity) + if constexpr (cuda::argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity) { return algorithm_variant::shared_memory; } @@ -48,7 +48,7 @@ TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg) template TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_segments) { - auto __highest = cuda::std::min(default_max_segment_size, static_cast(cuda::__argument::__highest_(__seg_size))); + auto __highest = cuda::std::min(default_max_segment_size, static_cast(cuda::argument::__highest_(__seg_size))); return __highest * __num_segments; } @@ -56,9 +56,9 @@ TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_se template TEST_FUNC constexpr int process_segments(_SegSizeArg __seg_size) { - const auto& __val = cuda::__argument::__unwrap(__seg_size); + const auto& __val = cuda::argument::__unwrap(__seg_size); - if constexpr (cuda::__argument::__traits<_SegSizeArg>::is_single_value) + if constexpr (cuda::argument::__traits<_SegSizeArg>::is_single_value) { return static_cast(__val); } @@ -93,7 +93,7 @@ TEST_FUNC constexpr bool test() // static_argument: scalar, fits in shared memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__constant<128>{}; + constexpr auto seg_size = cuda::argument::constant<128>{}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 128 * 4); assert(process_segments(seg_size) == 128); @@ -102,7 +102,7 @@ TEST_FUNC constexpr bool test() #if TEST_HAS_CLASS_NTTP // static_argument: array sequence, highest fits in shared memory { - constexpr auto seg_sizes = cuda::__argument::__constant_sequence{}; + constexpr auto seg_sizes = cuda::argument::constant_sequence{}; static_assert(select_variant(seg_sizes) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_sizes, 3) == 256 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 256); @@ -110,7 +110,7 @@ TEST_FUNC constexpr bool test() // static_argument: array sequence, highest exceeds shared memory, buffer clamped { - constexpr auto seg_sizes = cuda::__argument::__constant_sequence{}; + constexpr auto seg_sizes = cuda::argument::constant_sequence{}; static_assert(select_variant(seg_sizes) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_sizes, 3) == 512 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 512); @@ -119,7 +119,7 @@ TEST_FUNC constexpr bool test() // dynamic_argument: tight static bounds, shared memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__immediate{100, cuda::__argument::__bounds<1, 256>()}; + constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 256>()}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); @@ -127,7 +127,7 @@ TEST_FUNC constexpr bool test() // dynamic_argument: wide static bounds, global memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__immediate{100, cuda::__argument::__bounds<1, 4096>()}; + constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 4096>()}; static_assert(select_variant(seg_size) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); @@ -135,7 +135,7 @@ TEST_FUNC constexpr bool test() // dynamic_argument: no bounds, global memory, buffer = value { - constexpr auto seg_size = cuda::__argument::__immediate{100}; + constexpr auto seg_size = cuda::argument::immediate{100}; static_assert(select_variant(seg_size) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); @@ -143,9 +143,8 @@ TEST_FUNC constexpr bool test() // dynamic_argument: per-segment span with runtime bounds only { - int sizes[3] = {64, 128, 96}; - auto seg_sizes = - cuda::__argument::__immediate_sequence{cuda::std::span{sizes, 3}, cuda::__argument::__bounds(1, 200)}; + int sizes[3] = {64, 128, 96}; + auto seg_sizes = cuda::argument::immediate_sequence{cuda::std::span{sizes, 3}, cuda::argument::bounds(1, 200)}; assert(select_variant(seg_sizes) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_sizes, 3) == 200 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 96); @@ -154,9 +153,9 @@ TEST_FUNC constexpr bool test() // dynamic_argument: per-segment span with both bounds { int sizes[3] = {64, 128, 96}; - auto seg_sizes = cuda::__argument::__immediate_sequence{ - cuda::std::span{sizes, 3}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(1, 200)}; - static_assert(cuda::__argument::__traits::highest <= shared_memory_capacity); + auto seg_sizes = cuda::argument::immediate_sequence{ + cuda::std::span{sizes, 3}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)}; + static_assert(cuda::argument::__traits::highest <= shared_memory_capacity); assert(select_variant(seg_sizes) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_sizes, 3) == 200 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 96); @@ -165,9 +164,9 @@ TEST_FUNC constexpr bool test() // deferred_argument: uniform, bounds for decisions only { int val = 100; - auto seg_size = cuda::__argument::__deferred{ - cuda::std::span{&val, 1}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(1, 200)}; - static_assert(cuda::__argument::__traits::highest <= shared_memory_capacity); + auto seg_size = cuda::argument::deferred{ + cuda::std::span{&val, 1}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)}; + static_assert(cuda::argument::__traits::highest <= shared_memory_capacity); assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 200 * 4); } @@ -183,14 +182,14 @@ TEST_FUNC constexpr bool test() #if TEST_HAS_CLASS_NTTP // static_argument float (float NTTPs require C++20) { - constexpr auto seg_size = cuda::__argument::__constant<128.0f>{}; + constexpr auto seg_size = cuda::argument::constant<128.0f>{}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(process_segments(seg_size) == 128); } // dynamic_argument float with static bounds { - constexpr auto seg_size = cuda::__argument::__immediate{100.0f, cuda::__argument::__bounds<1.0f, 256.0f>()}; + constexpr auto seg_size = cuda::argument::immediate{100.0f, cuda::argument::bounds<1.0f, 256.0f>()}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(process_segments(seg_size) == 100); } From 3210e3d096dd515e8143d1d9c18512b0cf225832 Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Wed, 3 Jun 2026 23:35:26 -0700 Subject: [PATCH 2/5] Forgot about the bounds types --- libcudacxx/include/cuda/__argument/argument.h | 87 +++++++++---------- .../include/cuda/__argument/argument_bounds.h | 20 ++--- .../cuda/argument/argument_bounds.pass.cpp | 16 ++-- .../cuda/argument/argument_traits.pass.cpp | 6 +- .../cuda/argument/deferred_argument.pass.cpp | 2 +- .../static_bounds_conversion.fail.cpp | 2 +- .../static_bounds_type_mismatch.fail.cpp | 2 +- 7 files changed, 67 insertions(+), 68 deletions(-) diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index 42d66233c6e..035e0cdcf2d 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -174,13 +174,13 @@ template inline constexpr bool __valid_static_bounds_v = true; template -inline constexpr bool __valid_static_bounds_v<_ElementType, __static_bounds<_Lowest, _Highest>> = +inline constexpr bool __valid_static_bounds_v<_ElementType, static_bounds<_Lowest, _Highest>> = __static_bound_in_range<_ElementType, _Lowest>() && __static_bound_in_range<_ElementType, _Highest>(); template _CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept { - if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>) + if constexpr (::cuda::std::is_same_v<_StaticBounds, no_bounds>) { return ::cuda::std::numeric_limits<_ElementType>::lowest(); } @@ -193,7 +193,7 @@ _CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept template _CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept { - if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>) + if constexpr (::cuda::std::is_same_v<_StaticBounds, no_bounds>) { return (::cuda::std::numeric_limits<_ElementType>::max)(); } @@ -204,28 +204,28 @@ _CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept } template -_CCCL_API constexpr _ElementType __effective_lowest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +_CCCL_API constexpr _ElementType __effective_lowest(runtime_bounds<_ElementType> __runtime_bounds) noexcept { auto __static_lowest = __wrapper_static_lowest<_ElementType, _StaticBounds>(); return __static_lowest > __runtime_bounds.lower() ? __static_lowest : __runtime_bounds.lower(); } template -_CCCL_API constexpr _ElementType __effective_highest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +_CCCL_API constexpr _ElementType __effective_highest(runtime_bounds<_ElementType> __runtime_bounds) noexcept { auto __static_highest = __wrapper_static_highest<_ElementType, _StaticBounds>(); return __static_highest < __runtime_bounds.upper() ? __static_highest : __runtime_bounds.upper(); } template -_CCCL_API constexpr bool __has_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +_CCCL_API constexpr bool __has_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept { return __effective_lowest<_ElementType, _StaticBounds>(__runtime_bounds) <= __effective_highest<_ElementType, _StaticBounds>(__runtime_bounds); } template -_CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept +_CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept { static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>, "static argument bounds cannot be represented by the element type"); @@ -236,7 +236,7 @@ _CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_Elemen template _CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const _ElementType& __val) noexcept { - if constexpr (!::cuda::std::is_same_v<_StaticBounds, __no_bounds>) + if constexpr (!::cuda::std::is_same_v<_StaticBounds, no_bounds>) { _CCCL_ASSERT((__val >= __wrapper_static_lowest<_ElementType, _StaticBounds>()), "immediate argument value is below static lowest bound"); @@ -247,7 +247,7 @@ _CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const template _CCCL_API constexpr void __validate_runtime_element_bounds( - [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] __runtime_bounds<_ElementType> __runtime_bounds) noexcept + [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] runtime_bounds<_ElementType> __runtime_bounds) noexcept { _CCCL_ASSERT((__val >= __runtime_bounds.lower()), "immediate argument value is below runtime lower bound"); _CCCL_ASSERT((__val <= __runtime_bounds.upper()), "immediate argument value is above runtime upper bound"); @@ -260,7 +260,7 @@ _CCCL_API constexpr void __validate_runtime_element_bounds( //! @brief Wraps a runtime argument value with optional bounds. //! //! The value is host-accessible at API call time. -template +template struct immediate { using __element_type = __element_type_of_t<_Arg>; @@ -296,8 +296,8 @@ struct immediate #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>) - -> immediate<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate(_Arg, static_bounds<_Lowest, _Highest>) + -> immediate<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED @@ -306,7 +306,7 @@ _CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>) // ===================================================================== //! @brief Wraps a runtime argument sequence with optional bounds. -template +template struct immediate_sequence { using __element_type = __element_type_of_t<_Arg>; @@ -316,7 +316,7 @@ struct immediate_sequence "static argument bounds cannot be represented by the element type"); _Arg __arg_; - __runtime_bounds<__element_type> __runtime_bounds_{}; + runtime_bounds<__element_type> __runtime_bounds_{}; private: _CCCL_API constexpr void __validate_bounds() const noexcept @@ -357,7 +357,7 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -367,7 +367,7 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -377,23 +377,23 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept + _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept : immediate_sequence(::cuda::std::move(__arg), __sb, __rb) {} }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>) - -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>) + -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== @@ -401,7 +401,7 @@ _CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bound // ===================================================================== //! @brief Common base for deferred argument wrappers. -template +template struct __deferred_base { using __element_type = __element_type_of_t<_Arg>; @@ -410,7 +410,7 @@ struct __deferred_base "static argument bounds cannot be represented by the element type"); _Arg __arg_; - __runtime_bounds<__element_type> __runtime_bounds_{}; + runtime_bounds<__element_type> __runtime_bounds_{}; _CCCL_API constexpr __deferred_base(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} @@ -425,7 +425,7 @@ struct __deferred_base } template - _CCCL_API constexpr __deferred_base(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __deferred_base(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -434,7 +434,7 @@ struct __deferred_base } template - _CCCL_API constexpr __deferred_base(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __deferred_base(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -443,14 +443,14 @@ struct __deferred_base } template - _CCCL_API constexpr __deferred_base(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept + _CCCL_API constexpr __deferred_base(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept : __deferred_base(::cuda::std::move(__arg), __sb, __rb) {} }; //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available //! by the time the argument is consumed in stream order. -template +template struct deferred : __deferred_base<_Arg, _StaticBounds> { using __deferred_base<_Arg, _StaticBounds>::__deferred_base; @@ -461,24 +461,23 @@ template _CCCL_HOST_DEVICE deferred(_Arg) -> deferred<_Arg>; template -_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>) - -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, static_bounds<_Lowest, _Highest>) -> deferred<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>) -> deferred<_Arg>; +_CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>) -> deferred<_Arg>; template -_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> deferred<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> deferred<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be //! available by the time the argument is consumed in stream order. -template +template struct deferred_sequence : __deferred_base<_Arg, _StaticBounds> { using __deferred_base<_Arg, _StaticBounds>::__deferred_base; @@ -489,19 +488,19 @@ template _CCCL_HOST_DEVICE deferred_sequence(_Arg) -> deferred_sequence<_Arg>; template -_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>) - -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, static_bounds<_Lowest, _Highest>) + -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> deferred_sequence<_Arg>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>) -> deferred_sequence<_Arg>; template -_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>) - -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>) - -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h index 5ccaafb4589..630ae897910 100644 --- a/libcudacxx/include/cuda/__argument/argument_bounds.h +++ b/libcudacxx/include/cuda/__argument/argument_bounds.h @@ -31,7 +31,7 @@ _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT //! @brief Sentinel type indicating no bounds are present. -struct __no_bounds +struct no_bounds {}; // ===================================================================== @@ -45,7 +45,7 @@ struct __no_bounds //! @tparam _Lower The static lower bound. //! @tparam _Upper The static upper bound. template -struct __static_bounds +struct static_bounds { static_assert(::cuda::std::is_same_v, "Static bounds endpoints must have the same type"); @@ -64,7 +64,7 @@ struct __static_bounds template inline constexpr bool __is_static_bounds_v = false; template -inline constexpr bool __is_static_bounds_v<__static_bounds<_Lower, _Upper>> = true; +inline constexpr bool __is_static_bounds_v> = true; // ===================================================================== // runtime_bounds @@ -74,14 +74,14 @@ inline constexpr bool __is_static_bounds_v<__static_bounds<_Lower, _Upper>> = tr //! //! @tparam _Tp The value type of the bounds. template -struct __runtime_bounds +struct runtime_bounds { _Tp __lower_ = ::cuda::std::numeric_limits<_Tp>::lowest(); _Tp __upper_ = (::cuda::std::numeric_limits<_Tp>::max)(); - constexpr __runtime_bounds() noexcept = default; + constexpr runtime_bounds() noexcept = default; - _CCCL_API constexpr __runtime_bounds(_Tp __lower, _Tp __upper) noexcept + _CCCL_API constexpr runtime_bounds(_Tp __lower, _Tp __upper) noexcept : __lower_(__lower) , __upper_(__upper) { @@ -101,13 +101,13 @@ struct __runtime_bounds #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE __runtime_bounds(_Tp, _Tp) -> __runtime_bounds<_Tp>; +_CCCL_HOST_DEVICE runtime_bounds(_Tp, _Tp) -> runtime_bounds<_Tp>; #endif // _CCCL_DOXYGEN_INVOKED template inline constexpr bool __is_runtime_bounds_v = false; template -inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true; +inline constexpr bool __is_runtime_bounds_v> = true; // ===================================================================== // bounds — factory functions @@ -119,7 +119,7 @@ inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true; //! @tparam _Upper The static upper bound. //! @return A compile-time bounds object. template -[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> bounds() noexcept +[[nodiscard]] _CCCL_API constexpr static_bounds<_Lower, _Upper> bounds() noexcept { return {}; } @@ -130,7 +130,7 @@ template //! @param __upper The runtime upper bound. //! @return A runtime bounds object. template -[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept +[[nodiscard]] _CCCL_API constexpr runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept { return {__lower, __upper}; } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp index 9a5717f32c3..4275c4aca9c 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp @@ -20,21 +20,21 @@ TEST_FUNC constexpr bool test() // Basic static bounds { - constexpr auto b = cuda::argument::__static_bounds<1, 4096>{}; + constexpr auto b = cuda::argument::static_bounds<1, 4096>{}; static_assert(b.lower() == 1); static_assert(b.upper() == 4096); } // Exact static bounds { - constexpr auto b = cuda::argument::__static_bounds<42, 42>{}; + constexpr auto b = cuda::argument::static_bounds<42, 42>{}; static_assert(b.lower() == 42); static_assert(b.upper() == 42); } // Long type deduced from NTTPs { - static_assert(cuda::std::is_same_v::lower()), long>); + static_assert(cuda::std::is_same_v::lower()), long>); } #if TEST_HAS_CLASS_NTTP @@ -52,7 +52,7 @@ TEST_FUNC constexpr bool test() // Basic runtime bounds { - auto b = cuda::argument::__runtime_bounds{10, 100}; + auto b = cuda::argument::runtime_bounds{10, 100}; assert(b.lower() == 10); assert(b.upper() == 100); assert(b.__lower_ == 10); @@ -86,10 +86,10 @@ TEST_FUNC constexpr bool test() // Static and runtime bounds intersection { - static_assert(cuda::argument::__has_bounds_intersection>( - cuda::argument::__runtime_bounds{50, 200})); - static_assert(!cuda::argument::__has_bounds_intersection>( - cuda::argument::__runtime_bounds{0, 50})); + static_assert(cuda::argument::__has_bounds_intersection>( + cuda::argument::runtime_bounds{50, 200})); + static_assert(!cuda::argument::__has_bounds_intersection>( + cuda::argument::runtime_bounds{0, 50})); } // Non-bounds type diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp index 50a6b51c932..632fe50945a 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp @@ -143,12 +143,12 @@ TEST_FUNC void test() static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); static_assert( - cuda::argument::__traits>>::lowest == 1); + cuda::argument::__traits>>::lowest == 1); static_assert( - cuda::argument::__traits>&>::highest == 8); + cuda::argument::__traits>&>::highest == 8); static_assert( cuda::argument::__traits< - cuda::argument::immediate_sequence, cuda::argument::__static_bounds<1, 8>>>::highest + cuda::argument::immediate_sequence, cuda::argument::static_bounds<1, 8>>>::highest == 8); #if TEST_HAS_CLASS_NTTP static_assert( diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp index f9bda950aa7..5dc2fb849c7 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp @@ -41,7 +41,7 @@ TEST_FUNC constexpr bool test() // Deferred single value via pointer { int val = 42; - using def_t = cuda::argument::deferred>; + using def_t = cuda::argument::deferred>; static_assert(cuda::argument::__traits::lowest == 0); static_assert(cuda::argument::__traits::highest == 100); // Also verify construction works diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp index 79a96a3366e..7f9902e50a2 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp @@ -10,7 +10,7 @@ #include -using arg_t = cuda::argument::immediate>; +using arg_t = cuda::argument::immediate>; [[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits::highest; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp index d3a0499fc55..8170fdd7ee5 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp @@ -10,7 +10,7 @@ #include -[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::__static_bounds<0, 1L>{}; +[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::static_bounds<0, 1L>{}; int main(int, char**) { From 2e084a921827f40709755e4e57084f8d8b7a3f80 Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Thu, 4 Jun 2026 14:04:59 -0700 Subject: [PATCH 3/5] Tighten bounds validation --- libcudacxx/include/cuda/__argument/argument.h | 29 +++++++++++++------ .../cuda/argument/argument_bounds.pass.cpp | 8 +++++ .../static_argument_bounds_type.fail.cpp | 20 +++++++++++++ 3 files changed, 48 insertions(+), 9 deletions(-) create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index 035e0cdcf2d..b955e1f428f 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -171,7 +171,10 @@ _CCCL_API constexpr bool __static_bound_in_range() noexcept } template -inline constexpr bool __valid_static_bounds_v = true; +inline constexpr bool __valid_static_bounds_v = false; + +template +inline constexpr bool __valid_static_bounds_v<_ElementType, no_bounds> = true; template inline constexpr bool __valid_static_bounds_v<_ElementType, static_bounds<_Lowest, _Highest>> = @@ -228,7 +231,8 @@ template _CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept { static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)), "static and runtime argument bounds do not intersect"); } @@ -266,7 +270,8 @@ struct immediate using __element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); _Arg __arg_; @@ -313,7 +318,8 @@ struct immediate_sequence static_assert(__is_sequence_v<_Arg>, "immediate sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); _Arg __arg_; runtime_bounds<__element_type> __runtime_bounds_{}; @@ -407,7 +413,8 @@ struct __deferred_base using __element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); _Arg __arg_; runtime_bounds<__element_type> __runtime_bounds_{}; @@ -693,7 +700,8 @@ struct __traits_impl> using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = false; @@ -722,7 +730,8 @@ struct __traits_impl> using element_type = __element_type_of_t<_Arg>; static_assert(__is_sequence_v, "immediate sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = false; @@ -737,7 +746,8 @@ struct __traits_impl> using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = true; @@ -752,7 +762,8 @@ struct __traits_impl> using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "static argument bounds cannot be represented by the element type"); + "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "values representable by the element type"); static constexpr bool is_constant = false; static constexpr bool is_deferred = true; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp index 4275c4aca9c..6be3c0d925b 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp @@ -97,6 +97,14 @@ TEST_FUNC constexpr bool test() static_assert(!cuda::argument::__is_bounds_v); } + // Bounds types accepted by argument wrapper template parameters + { + static_assert(cuda::argument::__valid_static_bounds_v); + static_assert(cuda::argument::__valid_static_bounds_v>); + static_assert(!cuda::argument::__valid_static_bounds_v>); + static_assert(!cuda::argument::__valid_static_bounds_v); + } + return true; } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp new file mode 100644 index 00000000000..66a1828769b --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using arg_t = cuda::argument::immediate>; + +[[maybe_unused]] arg_t invalid_arg{0}; + +int main(int, char**) +{ + return 0; +} From 2c740b7dfa2080377f6c51928e2432fe8f5e4413 Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Thu, 4 Jun 2026 14:12:53 -0700 Subject: [PATCH 4/5] Asssert deferred_sequence type is a sequence --- libcudacxx/include/cuda/__argument/argument.h | 3 +++ .../deferred_sequence_scalar.fail.cpp | 18 +++++++++++++++++ .../deferred_sequence_scalar_traits.fail.cpp | 20 +++++++++++++++++++ 3 files changed, 41 insertions(+) create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index b955e1f428f..8c1933bdc7b 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -487,6 +487,8 @@ _CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Hi template struct deferred_sequence : __deferred_base<_Arg, _StaticBounds> { + static_assert(__is_sequence_v<_Arg>, "deferred sequence arguments must have a distinct element type"); + using __deferred_base<_Arg, _StaticBounds>::__deferred_base; }; @@ -761,6 +763,7 @@ struct __traits_impl> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; + static_assert(__is_sequence_v, "deferred sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v, "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " "values representable by the element type"); diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp new file mode 100644 index 00000000000..1ce371b84e5 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp @@ -0,0 +1,18 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +[[maybe_unused]] cuda::argument::deferred_sequence invalid_arg{0}; + +int main(int, char**) +{ + return 0; +} diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp new file mode 100644 index 00000000000..41a9a2ae778 --- /dev/null +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of libcu++, the C++ Standard Library for your entire system, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +#include + +using traits = cuda::argument::__traits>; + +[[maybe_unused]] constexpr bool invalid_traits = traits::is_deferred; + +int main(int, char**) +{ + return 0; +} From 9088870b74443d908b05c80e381d6e129b269b7d Mon Sep 17 00:00:00 2001 From: Piotr Ciolkosz Date: Fri, 5 Jun 2026 15:20:02 -0700 Subject: [PATCH 5/5] Feedback from review meeting --- .../bench/segmented_topk/fixed/keys.cu | 10 +- .../bench/segmented_topk/variable/keys.cu | 12 +- cub/cub/agent/agent_batched_topk.cuh | 10 +- cub/cub/detail/segmented_params.cuh | 34 +- .../device/dispatch/dispatch_batched_topk.cuh | 18 +- .../dispatch/kernels/kernel_batched_topk.cuh | 8 +- .../catch2_test_device_segmented_topk_keys.cu | 26 +- ...catch2_test_device_segmented_topk_pairs.cu | 18 +- libcudacxx/include/cuda/__argument/argument.h | 317 ++++++++++++------ .../include/cuda/__argument/argument_bounds.h | 6 +- .../include/cuda/std/__internal/namespaces.h | 4 +- .../cuda/argument/argument_bounds.pass.cpp | 52 ++- .../cuda/argument/argument_traits.pass.cpp | 165 +++++---- .../cuda/argument/deferred_argument.pass.cpp | 118 +++---- .../deferred_sequence_scalar.fail.cpp | 2 +- .../deferred_sequence_scalar_traits.fail.cpp | 2 +- .../cuda/argument/dynamic_argument.pass.cpp | 123 ++++--- .../cuda/argument/static_argument.pass.cpp | 85 +++-- .../static_argument_bounds_type.fail.cpp | 2 +- .../static_bounds_conversion.fail.cpp | 4 +- .../static_bounds_type_mismatch.fail.cpp | 2 +- .../cuda/argument/usage_example.pass.cpp | 71 ++-- 22 files changed, 610 insertions(+), 479 deletions(-) diff --git a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu index 4178e7ea0d5..77bd997b742 100644 --- a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu @@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys( const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected); const auto num_segments = ::cuda::std::max(1, (max_elements / segment_size)); const auto elements = num_segments * segment_size; - const auto total_num_items = ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(elements)}; + const auto total_num_items = ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(elements)}; const bit_entropy entropy = str_to_entropy(state.get_string("Entropy")); // Skip workloads where k exceeds the segment size @@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys( auto d_keys_in = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size); auto d_keys_out = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements); - auto segment_sizes = ::cuda::argument::constant{}; - auto k = ::cuda::argument::constant{}; - auto select_direction = ::cuda::argument::constant{}; + auto segment_sizes = ::cuda::args::constant{}; + auto k = ::cuda::args::constant{}; + auto select_direction = ::cuda::args::constant{}; state.add_element_count(elements, "NumElements"); state.add_element_count(segment_size, "SegmentSize"); @@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys( segment_sizes, k, select_direction, - ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(num_segments)}, + ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(num_segments)}, total_num_items, env); }); diff --git a/cub/benchmarks/bench/segmented_topk/variable/keys.cu b/cub/benchmarks/bench/segmented_topk/variable/keys.cu index d14394afc39..4f7effc912a 100644 --- a/cub/benchmarks/bench/segmented_topk/variable/keys.cu +++ b/cub/benchmarks/bench/segmented_topk/variable/keys.cu @@ -172,17 +172,17 @@ void variable_seg_size_topk_keys(nvbench::state& state, static_cast(MaxSegmentSize)); const auto input_elements = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end()); const auto output_elements = static_cast(num_segments) * K; - const auto total_num_items = ::cuda::argument::immediate{static_cast(input_elements)}; + const auto total_num_items = ::cuda::args::immediate{static_cast(input_elements)}; auto in_keys_buffer = gen_data( num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data())); auto out_keys_buffer = thrust::device_vector(output_elements, thrust::no_init); - auto segment_sizes_param = ::cuda::argument::immediate_sequence{ - thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::argument::bounds<1, MaxSegmentSize>()}; - auto k_param = ::cuda::argument::constant{}; - auto select_direction = ::cuda::argument::constant{}; - auto num_segments_param = ::cuda::argument::immediate{static_cast(num_segments)}; + auto segment_sizes_param = ::cuda::args::__immediate_sequence{ + thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()}; + auto k_param = ::cuda::args::constant{}; + auto select_direction = ::cuda::args::constant{}; + auto num_segments_param = ::cuda::args::immediate{static_cast(num_segments)}; auto d_keys_in = cuda::make_strided_iterator( cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())), diff --git a/cub/cub/agent/agent_batched_topk.cuh b/cub/cub/agent/agent_batched_topk.cuh index bc10311d36d..d2a99cfc809 100644 --- a/cub/cub/agent/agent_batched_topk.cuh +++ b/cub/cub/agent/agent_batched_topk.cuh @@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment using key_t = it_value_t; using value_t = it_value_t; - using segment_size_val_t = typename ::cuda::argument::__traits::element_type; - using num_segments_val_t = typename ::cuda::argument::__traits::element_type; + using segment_size_val_t = typename ::cuda::args::__traits::element_type; + using num_segments_val_t = typename ::cuda::args::__traits::element_type; using counters_t = batched_topk_counters; static constexpr auto policy = PolicyGetter{}(); @@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread; // Check if there could be large segments present - static constexpr bool only_small_segments = ::cuda::argument::__traits::highest <= tile_size; + static constexpr bool only_small_segments = ::cuda::args::__traits::highest <= tile_size; // Check if we are dealing with keys-only or key-value pairs static constexpr bool is_keys_only = ::cuda::std::is_same_v; @@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment return; } - constexpr bool is_full_tile = ::cuda::argument::__traits::is_constant - && ::cuda::argument::__traits::lowest == tile_size; + constexpr bool is_full_tile = ::cuda::args::__traits::is_constant + && ::cuda::args::__traits::lowest == tile_size; // Resolve Segment Parameters const auto segment_size = params::get_param(segment_sizes, segment_id); diff --git a/cub/cub/detail/segmented_params.cuh b/cub/cub/detail/segmented_params.cuh index e9a999a87b1..624a5da1a23 100644 --- a/cub/cub/detail/segmented_params.cuh +++ b/cub/cub/detail/segmented_params.cuh @@ -33,10 +33,10 @@ namespace detail::params //! @param[in] __index Segment index to read for sequence arguments. //! @return The single argument value, or the sequence element at the given index. _CCCL_TEMPLATE(class _Tp, class _SegmentIndexT) -_CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) +_CCCL_REQUIRES((!::cuda::args::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - if constexpr (::cuda::argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value) + if constexpr (::cuda::args::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value) { return __arg; } @@ -46,46 +46,46 @@ _CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_T } } -template +template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::argument::constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::constant<_Value, _Tp>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg); + return ::cuda::args::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::argument::constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg)[__index]; + return ::cuda::args::__unwrap(__arg)[__index]; } template -[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param( - const ::cuda::argument::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto +get_param(const ::cuda::args::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg); + return ::cuda::args::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::argument::immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg)[__index]; + return ::cuda::args::__unwrap(__arg)[__index]; } template -[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param( - const ::cuda::argument::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept +[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto +get_param(const ::cuda::args::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg); + return ::cuda::args::__unwrap(__arg); } template [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto -get_param(const ::cuda::argument::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept +get_param(const ::cuda::args::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept { - return ::cuda::argument::__unwrap(__arg)[__index]; + return ::cuda::args::__unwrap(__arg)[__index]; } // ===================================================================== diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh index 68a3a4f26b0..b1151f0dc4a 100644 --- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh +++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh @@ -50,8 +50,8 @@ namespace detail::batched_topk // ----------------------------------------------------------------------------- // Uniform (compile-time): constant -> single-option uniform_discrete_param. -template -[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::argument::constant) +template +[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::args::constant) { return params::uniform_discrete_param{Dir}; } @@ -126,7 +126,7 @@ template >, it_value_t>, ::cuda::std::int64_t, - ::cuda::argument::__traits::highest>> + ::cuda::args::__traits::highest>> #if _CCCL_HAS_CONCEPTS() requires batched_topk_policy_selector #endif // _CCCL_HAS_CONCEPTS() @@ -145,7 +145,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( cudaStream_t stream = nullptr, [[maybe_unused]] PolicySelector policy_selector = {}) { - using large_segment_tile_offset_t = typename ::cuda::argument::__traits::element_type; + using large_segment_tile_offset_t = typename ::cuda::args::__traits::element_type; // Wrap the raw enum into the internal discrete param type auto select_directions = wrap_select_direction(select_direction); @@ -171,9 +171,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( static constexpr int worker_per_segment_tile_size = worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread; static constexpr bool any_small_segments = - ::cuda::argument::__traits::lowest <= worker_per_segment_tile_size; + ::cuda::args::__traits::lowest <= worker_per_segment_tile_size; static constexpr bool only_small_segments = - ::cuda::argument::__traits::highest <= worker_per_segment_tile_size; + ::cuda::args::__traits::highest <= worker_per_segment_tile_size; // Allocation layout: // only_small_segments: [0] dummy. @@ -183,7 +183,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( static constexpr int allocations_array_size = only_small_segments ? 1 : (any_small_segments ? 3 : 2); size_t allocation_sizes[allocations_array_size] = {1}; - using num_segments_val_t = typename ::cuda::argument::__traits::element_type; + using num_segments_val_t = typename ::cuda::args::__traits::element_type; using counters_t = batched_topk_counters; using segment_size_scan_offset_t = detail::choose_offset_t; using segment_size_scan_input_op_t = @@ -239,7 +239,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch( // TODO (elstehle): support number of segments provided by device-accessible iterator // Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host) - static_assert(::cuda::argument::__traits::is_single_value, + static_assert(::cuda::args::__traits::is_single_value, "Only uniform segment sizes are currently supported."); if constexpr (any_small_segments) @@ -341,7 +341,7 @@ template >, it_value_t>, ::cuda::std::int64_t, - ::cuda::argument::__traits::highest>; + ::cuda::args::__traits::highest>; return detail::dispatch_with_env_and_tuning( env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) { return dispatch( diff --git a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh index 9c904f95971..3412f40359a 100644 --- a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh +++ b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh @@ -39,7 +39,7 @@ private: worker_policy worker_per_segment_policy; multi_worker_policy multi_worker_per_segment_policy; }; - static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::argument::__traits::highest; + static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::args::__traits::highest; static constexpr batched_topk_policy active_policy = current_policy(); template @@ -133,8 +133,8 @@ __launch_bounds__(int( KParameterT k, SelectDirectionParameterT select_directions, NumSegmentsParameterT num_segments, - batched_topk_counters::element_type>* d_counters, - typename ::cuda::argument::__traits::element_type* d_large_segments_ids, + batched_topk_counters::element_type>* d_counters, + typename ::cuda::args::__traits::element_type* d_large_segments_ids, LargeSegmentTileOffsetT* d_large_segments_tile_offsets) { using agent_t = typename find_smallest_covering_policy< @@ -151,7 +151,7 @@ __launch_bounds__(int( LargeSegmentTileOffsetT>::agent_t; // Static Assertions (Constraints) - static_assert(agent_t::tile_size >= ::cuda::argument::__traits::highest, + static_assert(agent_t::tile_size >= ::cuda::args::__traits::highest, "Block size exceeds maximum segment size supported by SegmentSizeParameterT"); static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block, "Static shared memory per block must not exceed 48KB limit."); diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu index e16d5637f19..433e6b1a35a 100644 --- a/cub/test/catch2_test_device_segmented_topk_keys.cu +++ b/cub/test/catch2_test_device_segmented_topk_keys.cu @@ -151,11 +151,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments", batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, - ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, + ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, direction, - ::cuda::argument::immediate{num_segments}, - ::cuda::argument::immediate{num_segments * segment_size}); + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_segments * segment_size}); // Prepare expected results fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction); compact_sorted_keys_to_topk(expected_keys, segment_size, k); @@ -248,12 +248,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment batched_topk_keys( d_keys_in, d_keys_out, - ::cuda::argument::immediate_sequence{ - segment_size_it, ::cuda::argument::bounds()}, - ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, + ::cuda::args::__immediate_sequence{ + segment_size_it, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, direction, - ::cuda::argument::immediate{num_segments}, - ::cuda::argument::immediate{num_items}); + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_items}); // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction); @@ -286,11 +286,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment batched_topk_keys( d_keys_in_it, d_keys_out_it, - ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, - ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, + ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, cub::detail::topk::select::min, - ::cuda::argument::immediate{num_segments}, - ::cuda::argument::immediate{num_segments * segment_size}); + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_segments * segment_size}); const int num_minus_zero = static_cast(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{})); REQUIRE(num_minus_zero >= 1); diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu index eb2825f15d2..e7041e445c6 100644 --- a/cub/test/catch2_test_device_segmented_topk_pairs.cu +++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu @@ -220,11 +220,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments" d_keys_out, d_values_in, d_values_out, - ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds()}, - ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, + ::cuda::args::immediate{segment_size, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, direction, - ::cuda::argument::immediate{num_segments}, - ::cuda::argument::immediate{num_segments * segment_size}); + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_segments * segment_size}); // Verification: // - We verify correct top-k selection through the keys @@ -340,12 +340,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen d_keys_out, d_values_in, d_values_out, - ::cuda::argument::immediate_sequence{ - segment_size_it, ::cuda::argument::bounds()}, - ::cuda::argument::immediate{k, ::cuda::argument::bounds()}, + ::cuda::args::__immediate_sequence{ + segment_size_it, ::cuda::args::bounds()}, + ::cuda::args::immediate{k, ::cuda::args::bounds()}, direction, - ::cuda::argument::immediate{num_segments}, - ::cuda::argument::immediate{num_items}); + ::cuda::args::immediate{num_segments}, + ::cuda::args::immediate{num_items}); // Verification: // - We verify correct top-k selection through the keys diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h index 8c1933bdc7b..5e9d14a3bf5 100644 --- a/libcudacxx/include/cuda/__argument/argument.h +++ b/libcudacxx/include/cuda/__argument/argument.h @@ -44,6 +44,8 @@ _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT +struct __access; + // ===================================================================== // __element_type_of // ===================================================================== @@ -102,31 +104,28 @@ inline constexpr bool __is_iterable_v<_Tp, // spelling carries that intent. //! @brief Wraps a compile-time constant argument value. -template -struct constant +template +class constant { - using value_type = ::cuda::std::remove_cvref_t; +public: + using value_type = ::cuda::std::remove_cvref_t<_Tp>; using __element_type = value_type; - [[nodiscard]] _CCCL_API static constexpr value_type value() noexcept + [[nodiscard]] _CCCL_API static constexpr value_type __get_value() noexcept { - return _Value; + return static_cast(_Value); } }; //! @brief Wraps a compile-time constant argument sequence. template -struct constant_sequence +class __constant_sequence { +public: using value_type = ::cuda::std::remove_cvref_t; using __element_type = __element_type_of_t; static_assert(__is_sequence_v, "constant sequence arguments must have a distinct element type"); - - [[nodiscard]] _CCCL_API static constexpr value_type value() noexcept - { - return _Value; - } }; // ===================================================================== @@ -231,7 +230,7 @@ template _CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept { static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)), "static and runtime argument bounds do not intersect"); @@ -265,17 +264,20 @@ _CCCL_API constexpr void __validate_runtime_element_bounds( //! //! The value is host-accessible at API call time. template -struct immediate +class immediate { +public: using __element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); +private: + friend struct __access; + _Arg __arg_; -private: _CCCL_API constexpr void __validate_value() const noexcept { if constexpr (::cuda::std::is_same_v<::cuda::std::remove_cvref_t<_Arg>, __element_type> @@ -307,24 +309,27 @@ _CCCL_HOST_DEVICE immediate(_Arg, static_bounds<_Lowest, _Highest>) #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== -// immediate_sequence +// __immediate_sequence // ===================================================================== //! @brief Wraps a runtime argument sequence with optional bounds. template -struct immediate_sequence +class __immediate_sequence { +public: using __element_type = __element_type_of_t<_Arg>; static_assert(__is_sequence_v<_Arg>, "immediate sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); +private: + friend struct __access; + _Arg __arg_; runtime_bounds<__element_type> __runtime_bounds_{}; -private: _CCCL_API constexpr void __validate_bounds() const noexcept { __validate_bounds_intersection<__element_type, _StaticBounds>(__runtime_bounds_); @@ -348,14 +353,14 @@ struct immediate_sequence } public: - _CCCL_API constexpr immediate_sequence(_Arg __arg) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_bounds(); __validate_value(); } - _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds) noexcept : __arg_{::cuda::std::move(__arg)} { __validate_bounds(); @@ -363,7 +368,7 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -373,7 +378,7 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept + _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept : __arg_{::cuda::std::move(__arg)} , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()), __runtime_bound_cast<__element_type>(__rb.upper())} @@ -383,23 +388,23 @@ struct immediate_sequence } template - _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept - : immediate_sequence(::cuda::std::move(__arg), __sb, __rb) + _CCCL_API constexpr __immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept + : __immediate_sequence(::cuda::std::move(__arg), __sb, __rb) {} }; #ifndef _CCCL_DOXYGEN_INVOKED template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>) - -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE __immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>) + -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) - -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE __immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>) + -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; template -_CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) - -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; +_CCCL_HOST_DEVICE __immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>) + -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED // ===================================================================== @@ -408,17 +413,22 @@ _CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_L //! @brief Common base for deferred argument wrappers. template -struct __deferred_base +class __deferred_base { +public: using __element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); +private: + friend struct __access; + _Arg __arg_; runtime_bounds<__element_type> __runtime_bounds_{}; +public: _CCCL_API constexpr __deferred_base(_Arg __arg) noexcept : __arg_{::cuda::std::move(__arg)} { @@ -458,8 +468,9 @@ struct __deferred_base //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available //! by the time the argument is consumed in stream order. template -struct deferred : __deferred_base<_Arg, _StaticBounds> +class deferred : public __deferred_base<_Arg, _StaticBounds> { +public: using __deferred_base<_Arg, _StaticBounds>::__deferred_base; }; @@ -485,8 +496,9 @@ _CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Hi //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be //! available by the time the argument is consumed in stream order. template -struct deferred_sequence : __deferred_base<_Arg, _StaticBounds> +class deferred_sequence : public __deferred_base<_Arg, _StaticBounds> { +public: static_assert(__is_sequence_v<_Arg>, "deferred sequence arguments must have a distinct element type"); using __deferred_base<_Arg, _StaticBounds>::__deferred_base; @@ -512,6 +524,97 @@ _CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lo -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>; #endif // _CCCL_DOXYGEN_INVOKED +// ===================================================================== +// __access +// ===================================================================== + +struct __access +{ + template + [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(immediate<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const _Arg& __arg(const immediate<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(immediate<_Arg, _StaticBounds>&& __wrapper) noexcept + { + return ::cuda::std::move(__wrapper.__arg_); + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(__immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const _Arg& + __arg(const __immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(__immediate_sequence<_Arg, _StaticBounds>&& __wrapper) noexcept + { + return ::cuda::std::move(__wrapper.__arg_); + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(__deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const _Arg& + __arg(const __deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__arg_; + } + + template + [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(__deferred_base<_Arg, _StaticBounds>&& __wrapper) noexcept + { + return ::cuda::std::move(__wrapper.__arg_); + } + + template + [[nodiscard]] _CCCL_API static constexpr runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(__immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(const __immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } + + template + [[nodiscard]] _CCCL_API static constexpr runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(__deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } + + template + [[nodiscard]] _CCCL_API static constexpr const runtime_bounds<__element_type_of_t<_Arg>>& + __runtime_bounds(const __deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept + { + return __wrapper.__runtime_bounds_; + } +}; + // ===================================================================== // __unwrap // ===================================================================== @@ -520,12 +623,12 @@ template inline constexpr bool __is_wrapper_v = false; template inline constexpr bool __is_wrapper_v> = true; +template +inline constexpr bool __is_wrapper_v> = true; template -inline constexpr bool __is_wrapper_v> = true; -template -inline constexpr bool __is_wrapper_v> = true; +inline constexpr bool __is_wrapper_v<__constant_sequence<_Value>> = true; template -inline constexpr bool __is_wrapper_v> = true; +inline constexpr bool __is_wrapper_v<__immediate_sequence<_Arg, _StaticBounds>> = true; template inline constexpr bool __is_wrapper_v> = true; template @@ -541,99 +644,99 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) ) template [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } -template -[[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t -__unwrap(const constant<_Value>&) noexcept +template +[[nodiscard]] _CCCL_API constexpr typename constant<_Value, _Tp>::value_type +__unwrap(const constant<_Value, _Tp>&) noexcept { - return _Value; + return constant<_Value, _Tp>::__get_value(); } template [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t -__unwrap(const constant_sequence<_Value>&) noexcept +__unwrap(const __constant_sequence<_Value>&) noexcept { return _Value; } template -[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept +[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } template [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } template [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept { - return ::cuda::std::move(__arg.__arg_); + return __access::__arg(::cuda::std::move(__arg)); } -template +template _CCCL_API constexpr auto __constant_compute_lowest() noexcept { - return _Value; + return constant<_Value, _Tp>::__get_value(); } -template +template _CCCL_API constexpr auto __constant_compute_highest() noexcept { - return _Value; + return constant<_Value, _Tp>::__get_value(); } template @@ -684,16 +787,16 @@ struct __traits_impl static constexpr element_type highest = (::cuda::std::numeric_limits::max)(); }; -template -struct __traits_impl> +template +struct __traits_impl> { - using value_type = ::cuda::std::remove_cvref_t; + using value_type = typename constant<_Value, _Tp>::value_type; using element_type = value_type; static constexpr bool is_constant = true; static constexpr bool is_deferred = false; static constexpr bool is_single_value = true; - static constexpr element_type lowest = __constant_compute_lowest<_Value>(); - static constexpr element_type highest = __constant_compute_highest<_Value>(); + static constexpr element_type lowest = __constant_compute_lowest<_Value, _Tp>(); + static constexpr element_type highest = __constant_compute_highest<_Value, _Tp>(); }; template @@ -702,7 +805,7 @@ struct __traits_impl> using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); static constexpr bool is_constant = false; @@ -713,7 +816,7 @@ struct __traits_impl> }; template -struct __traits_impl> +struct __traits_impl<__constant_sequence<_Value>> { using value_type = ::cuda::std::remove_cvref_t; using element_type = __element_type_of_t; @@ -726,13 +829,13 @@ struct __traits_impl> }; template -struct __traits_impl> +struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>> { using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__is_sequence_v, "immediate sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); static constexpr bool is_constant = false; @@ -748,7 +851,7 @@ struct __traits_impl> using value_type = _Arg; using element_type = __element_type_of_t<_Arg>; static_assert(__valid_static_bounds_v, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); static constexpr bool is_constant = false; @@ -765,7 +868,7 @@ struct __traits_impl> using element_type = __element_type_of_t<_Arg>; static_assert(__is_sequence_v, "deferred sequence arguments must have a distinct element type"); static_assert(__valid_static_bounds_v, - "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with " + "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with " "values representable by the element type"); static constexpr bool is_constant = false; @@ -791,14 +894,14 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) ) return ::cuda::std::numeric_limits<__element_type_of_t<_Tp>>::lowest(); } -template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value>) noexcept +template +[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value, _Tp>) noexcept { - return __constant_compute_lowest<_Value>(); + return __constant_compute_lowest<_Value, _Tp>(); } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant_sequence<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant_sequence<_Value>) noexcept { return __constant_sequence_compute_lowest<_Value>(); } @@ -806,31 +909,34 @@ template template [[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate<_Arg, _StaticBounds> __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds); } template [[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds); } template [[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds); } //! @brief Returns the effective highest bound, combining static and runtime bounds. @@ -841,14 +947,14 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) ) return (::cuda::std::numeric_limits<__element_type_of_t<_Tp>>::max)(); } -template -[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value>) noexcept +template +[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value, _Tp>) noexcept { - return __constant_compute_highest<_Value>(); + return __constant_compute_highest<_Value, _Tp>(); } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(constant_sequence<_Value>) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant_sequence<_Value>) noexcept { return __constant_sequence_compute_highest<_Value>(); } @@ -856,31 +962,34 @@ template template [[nodiscard]] _CCCL_API constexpr auto __highest_(immediate<_Arg, _StaticBounds> __arg) noexcept { - return __arg.__arg_; + return __access::__arg(__arg); } template -[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept +[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_highest<_ET, _StaticBounds>(__runtime_bounds); } template [[nodiscard]] _CCCL_API constexpr auto __highest_(deferred<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_highest<_ET, _StaticBounds>(__runtime_bounds); } template [[nodiscard]] _CCCL_API constexpr auto __highest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept { - using _ET = __element_type_of_t<_Arg>; - __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_); - return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_); + using _ET = __element_type_of_t<_Arg>; + const auto& __runtime_bounds = __access::__runtime_bounds(__arg); + __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds); + return __effective_highest<_ET, _StaticBounds>(__runtime_bounds); } _CCCL_END_NAMESPACE_CUDA_ARGUMENT diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h index 630ae897910..4f5429936dc 100644 --- a/libcudacxx/include/cuda/__argument/argument_bounds.h +++ b/libcudacxx/include/cuda/__argument/argument_bounds.h @@ -45,8 +45,9 @@ struct no_bounds //! @tparam _Lower The static lower bound. //! @tparam _Upper The static upper bound. template -struct static_bounds +class static_bounds { +public: static_assert(::cuda::std::is_same_v, "Static bounds endpoints must have the same type"); static_assert(_Lower <= _Upper, "Lower bound must be <= upper bound"); @@ -74,11 +75,12 @@ inline constexpr bool __is_static_bounds_v> = true //! //! @tparam _Tp The value type of the bounds. template -struct runtime_bounds +class runtime_bounds { _Tp __lower_ = ::cuda::std::numeric_limits<_Tp>::lowest(); _Tp __upper_ = (::cuda::std::numeric_limits<_Tp>::max)(); +public: constexpr runtime_bounds() noexcept = default; _CCCL_API constexpr runtime_bounds(_Tp __lower, _Tp __upper) noexcept diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h index f3cc191adfc..4876dd6f449 100644 --- a/libcudacxx/include/cuda/std/__internal/namespaces.h +++ b/libcudacxx/include/cuda/std/__internal/namespaces.h @@ -115,8 +115,8 @@ #define _CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION _CCCL_BEGIN_NAMESPACE(cuda::execution) #define _CCCL_END_NAMESPACE_CUDA_EXECUTION _CCCL_END_NAMESPACE(cuda::execution) -#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::argument) -#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT _CCCL_END_NAMESPACE(cuda::argument) +#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::args) +#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT _CCCL_END_NAMESPACE(cuda::args) // Namespace to avoid name collisions with CPOs on clang-16 (see // https://godbolt.org/z/9TadonrdM for example). MSVC's ancient parser also gets confused with diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp index 6be3c0d925b..46070dbabbd 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp @@ -16,31 +16,31 @@ TEST_FUNC constexpr bool test() { - // --- static_argument_bounds --- + // --- static_bounds --- // Basic static bounds { - constexpr auto b = cuda::argument::static_bounds<1, 4096>{}; + constexpr auto b = cuda::args::static_bounds<1, 4096>{}; static_assert(b.lower() == 1); static_assert(b.upper() == 4096); } // Exact static bounds { - constexpr auto b = cuda::argument::static_bounds<42, 42>{}; + constexpr auto b = cuda::args::static_bounds<42, 42>{}; static_assert(b.lower() == 42); static_assert(b.upper() == 42); } // Long type deduced from NTTPs { - static_assert(cuda::std::is_same_v::lower()), long>); + static_assert(cuda::std::is_same_v::lower()), long>); } #if TEST_HAS_CLASS_NTTP // Static bounds preserve their original NTTP types { - constexpr auto b = cuda::argument::bounds<1.0f, 8.0f>(); + constexpr auto b = cuda::args::bounds<1.0f, 8.0f>(); static_assert(b.lower() == 1.0f); static_assert(b.upper() == 8); static_assert(cuda::std::is_same_v); @@ -48,17 +48,13 @@ TEST_FUNC constexpr bool test() } #endif // TEST_HAS_CLASS_NTTP - // --- runtime_argument_bounds --- + // --- runtime_bounds --- // Basic runtime bounds { - auto b = cuda::argument::runtime_bounds{10, 100}; + auto b = cuda::args::runtime_bounds{10, 100}; assert(b.lower() == 10); assert(b.upper() == 100); - assert(b.__lower_ == 10); - assert(b.__upper_ == 100); - b.__upper_ = 90; - assert(b.upper() == 90); static_assert(cuda::std::is_same_v); } @@ -66,43 +62,43 @@ TEST_FUNC constexpr bool test() // Static via factory { - constexpr auto b = cuda::argument::bounds<1, 8>(); + constexpr auto b = cuda::args::bounds<1, 8>(); static_assert(b.lower() == 1); static_assert(b.upper() == 8); - static_assert(cuda::argument::__is_static_bounds_cv_v); - static_assert(!cuda::argument::__is_runtime_bounds_cv_v); - static_assert(cuda::argument::__is_bounds_v); + static_assert(cuda::args::__is_static_bounds_cv_v); + static_assert(!cuda::args::__is_runtime_bounds_cv_v); + static_assert(cuda::args::__is_bounds_v); } // Runtime via factory { - auto b = cuda::argument::bounds(10, 100); + auto b = cuda::args::bounds(10, 100); assert(b.lower() == 10); assert(b.upper() == 100); - static_assert(!cuda::argument::__is_static_bounds_cv_v); - static_assert(cuda::argument::__is_runtime_bounds_cv_v); - static_assert(cuda::argument::__is_bounds_v); + static_assert(!cuda::args::__is_static_bounds_cv_v); + static_assert(cuda::args::__is_runtime_bounds_cv_v); + static_assert(cuda::args::__is_bounds_v); } // Static and runtime bounds intersection { - static_assert(cuda::argument::__has_bounds_intersection>( - cuda::argument::runtime_bounds{50, 200})); - static_assert(!cuda::argument::__has_bounds_intersection>( - cuda::argument::runtime_bounds{0, 50})); + static_assert(cuda::args::__has_bounds_intersection>( + cuda::args::runtime_bounds{50, 200})); + static_assert(!cuda::args::__has_bounds_intersection>( + cuda::args::runtime_bounds{0, 50})); } // Non-bounds type { - static_assert(!cuda::argument::__is_bounds_v); + static_assert(!cuda::args::__is_bounds_v); } // Bounds types accepted by argument wrapper template parameters { - static_assert(cuda::argument::__valid_static_bounds_v); - static_assert(cuda::argument::__valid_static_bounds_v>); - static_assert(!cuda::argument::__valid_static_bounds_v>); - static_assert(!cuda::argument::__valid_static_bounds_v); + static_assert(cuda::args::__valid_static_bounds_v); + static_assert(cuda::args::__valid_static_bounds_v>); + static_assert(!cuda::args::__valid_static_bounds_v>); + static_assert(!cuda::args::__valid_static_bounds_v); } return true; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp index 632fe50945a..bf3aec4ed7b 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp @@ -50,128 +50,123 @@ TEST_FUNC void test() { // --- __is_sequence_v / __is_single_value_v --- - static_assert(!cuda::argument::__is_sequence_v); - static_assert(!cuda::argument::__is_sequence_v); - static_assert(!cuda::argument::__is_sequence_v); - static_assert(cuda::argument::__is_sequence_v); - static_assert(cuda::argument::__is_sequence_v>); - static_assert(cuda::argument::__is_sequence_v>); - static_assert(cuda::argument::__is_sequence_v&>); - static_assert(cuda::argument::__is_sequence_v>); - static_assert(cuda::argument::__is_sequence_v>); - static_assert(cuda::argument::__is_sequence_v>); - static_assert(cuda::argument::__is_sequence_v>); - static_assert(cuda::argument::__is_sequence_v>); - - static_assert(cuda::argument::__is_single_value_v); - static_assert(cuda::argument::__is_single_value_v); - static_assert(cuda::argument::__is_single_value_v); - static_assert(cuda::argument::__is_single_value_v); - static_assert(cuda::argument::__is_single_value_v); - static_assert(cuda::argument::__is_single_value_v); - static_assert(!cuda::argument::__is_single_value_v); - static_assert(!cuda::argument::__is_single_value_v>); - static_assert(!cuda::argument::__is_single_value_v>); - static_assert(!cuda::argument::__is_single_value_v&>); - static_assert(!cuda::argument::__is_single_value_v>); - static_assert(!cuda::argument::__is_single_value_v>); - static_assert(!cuda::argument::__is_single_value_v>); + static_assert(!cuda::args::__is_sequence_v); + static_assert(!cuda::args::__is_sequence_v); + static_assert(!cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v); + static_assert(cuda::args::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v&>); + static_assert(cuda::args::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v>); + static_assert(cuda::args::__is_sequence_v>); + + static_assert(cuda::args::__is_single_value_v); + static_assert(cuda::args::__is_single_value_v); + static_assert(cuda::args::__is_single_value_v); + static_assert(cuda::args::__is_single_value_v); + static_assert(cuda::args::__is_single_value_v); + static_assert(cuda::args::__is_single_value_v); + static_assert(!cuda::args::__is_single_value_v); + static_assert(!cuda::args::__is_single_value_v>); + static_assert(!cuda::args::__is_single_value_v>); + static_assert(!cuda::args::__is_single_value_v&>); + static_assert(!cuda::args::__is_single_value_v>); + static_assert(!cuda::args::__is_single_value_v>); + static_assert(!cuda::args::__is_single_value_v>); // --- __element_type_of_t --- - static_assert(cuda::std::is_same_v&>, int>); - static_assert(cuda::std::is_same_v, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); - static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v&>, int>); + static_assert(cuda::std::is_same_v, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); + static_assert(cuda::std::is_same_v>, int>); static_assert( - cuda::std::is_same_v>>, - int>); - static_assert(cuda::std::is_same_v>, int>); + cuda::std::is_same_v>>, int>); + static_assert(cuda::std::is_same_v>, int>); // --- argument_traits: is_deferred --- - static_assert(!cuda::argument::__traits::is_deferred); - static_assert(!cuda::argument::__traits>::is_deferred); - static_assert(!cuda::argument::__traits>>::is_deferred); - static_assert(!cuda::argument::__traits>::is_deferred); + static_assert(!cuda::args::__traits::is_deferred); + static_assert(!cuda::args::__traits>::is_deferred); + static_assert(!cuda::args::__traits>>::is_deferred); + static_assert(!cuda::args::__traits>::is_deferred); #if TEST_HAS_CLASS_NTTP - static_assert( - !cuda::argument::__traits{1, 2, 3}>>::is_deferred); + static_assert(!cuda::args::__traits{1, 2, 3}>>::is_deferred); #endif // TEST_HAS_CLASS_NTTP - static_assert(cuda::argument::__traits>>::is_deferred); - static_assert(cuda::argument::__traits>>::is_deferred); + static_assert(cuda::args::__traits>>::is_deferred); + static_assert(cuda::args::__traits>>::is_deferred); // --- argument_traits: is_single_value --- - static_assert(cuda::argument::__traits::is_single_value); - static_assert(!cuda::argument::__traits::is_single_value); - static_assert(cuda::argument::__traits>::is_single_value); - static_assert(cuda::argument::__traits>::is_single_value); - static_assert(cuda::argument::__traits>>::is_single_value); - static_assert(!cuda::argument::__traits>>::is_single_value); - static_assert(cuda::argument::__traits>::is_single_value); + static_assert(cuda::args::__traits::is_single_value); + static_assert(!cuda::args::__traits::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(cuda::args::__traits>>::is_single_value); + static_assert(!cuda::args::__traits>>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); #if TEST_HAS_CLASS_NTTP static_assert( - !cuda::argument::__traits{1, 2, 3}>>::is_single_value); + !cuda::args::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP - static_assert(cuda::argument::__traits>::is_single_value); - static_assert(!cuda::argument::__traits>>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(!cuda::args::__traits>>::is_single_value); // --- argument_traits: value_type --- - static_assert(cuda::std::is_same_v::value_type, int>); - static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert(cuda::std::is_same_v::value_type, int>); + static_assert(cuda::std::is_same_v>::value_type, int>); static_assert( - cuda::std::is_same_v>>::value_type, + cuda::std::is_same_v>>::value_type, cuda::std::span>); - static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert(cuda::std::is_same_v>::value_type, int>); + static_assert(cuda::std::is_same_v>::value_type, float>); #if TEST_HAS_CLASS_NTTP - static_assert( - cuda::std::is_same_v< - cuda::argument::__traits{1, 2, 3}>>::value_type, - cuda::std::array>); + static_assert(cuda::std::is_same_v< + cuda::args::__traits{1, 2, 3}>>::value_type, + cuda::std::array>); #endif // TEST_HAS_CLASS_NTTP // --- argument_traits: lowest / highest --- - static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); - static_assert( - cuda::argument::__traits>>::lowest == 1); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__traits>>::lowest == 1); + static_assert(cuda::args::__traits>&>::highest == 8); static_assert( - cuda::argument::__traits>&>::highest == 8); - static_assert( - cuda::argument::__traits< - cuda::argument::immediate_sequence, cuda::argument::static_bounds<1, 8>>>::highest + cuda::args::__traits, cuda::args::static_bounds<1, 8>>>::highest == 8); + static_assert(cuda::args::__traits>::lowest == 10.0f); + static_assert(cuda::args::__traits>::highest == 10.0f); #if TEST_HAS_CLASS_NTTP - static_assert( - cuda::argument::__traits{3, 1, 2}>>::lowest == 1); - static_assert( - cuda::argument::__traits{3, 1, 2}>>::highest == 3); + static_assert(cuda::args::__traits{3, 1, 2}>>::lowest == 1); + static_assert(cuda::args::__traits{3, 1, 2}>>::highest == 3); #endif // TEST_HAS_CLASS_NTTP // --- Free function bounds on plain values --- - static_assert(cuda::argument::__lowest_(42) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__highest_(42) == (cuda::std::numeric_limits::max)()); - static_assert(cuda::argument::__lowest_(1.0f) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__highest_(1.0f) == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__lowest_(42) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__highest_(42) == (cuda::std::numeric_limits::max)()); + static_assert(cuda::args::__lowest_(1.0f) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__highest_(1.0f) == (cuda::std::numeric_limits::max)()); // --- Scalar and sequence wrappers expose distinct single-value traits --- - static_assert(cuda::argument::__traits>::is_single_value); - static_assert(cuda::argument::__traits>::is_single_value); - static_assert(!cuda::argument::__traits>>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(cuda::args::__traits>::is_single_value); + static_assert(!cuda::args::__traits>>::is_single_value); #if TEST_HAS_CLASS_NTTP static_assert( - !cuda::argument::__traits{1, 2, 3}>>::is_single_value); + !cuda::args::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp index 5dc2fb849c7..21b200e8f0f 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp @@ -22,110 +22,110 @@ TEST_FUNC constexpr bool test() // Deferred single value via span { int val = 42; - auto def = cuda::argument::deferred{cuda::std::span{&val, 1}}; - assert(cuda::argument::__unwrap(def)[0] == 42); - assert(def.__arg_[0] == 42); - static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); + auto def = cuda::args::deferred{cuda::std::span{&val, 1}}; + assert(cuda::args::__unwrap(def)[0] == 42); + assert(cuda::args::__access::__arg(def)[0] == 42); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); } // Deferred single value with static bounds { int val = 42; - auto def = cuda::argument::deferred{cuda::std::span{&val, 1}, cuda::argument::bounds<1, 1000>()}; - assert(cuda::argument::__unwrap(def)[0] == 42); - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 1000); + auto def = cuda::args::deferred{cuda::std::span{&val, 1}, cuda::args::bounds<1, 1000>()}; + assert(cuda::args::__unwrap(def)[0] == 42); + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 1000); } // Deferred single value via pointer { int val = 42; - using def_t = cuda::argument::deferred>; - static_assert(cuda::argument::__traits::lowest == 0); - static_assert(cuda::argument::__traits::highest == 100); + using def_t = cuda::args::deferred>; + static_assert(cuda::args::__traits::lowest == 0); + static_assert(cuda::args::__traits::highest == 100); // Also verify construction works - auto def = cuda::argument::deferred{&val, cuda::argument::bounds<0, 100>()}; - assert(cuda::argument::__unwrap(def) == &val); + auto def = cuda::args::deferred{&val, cuda::args::bounds<0, 100>()}; + assert(cuda::args::__unwrap(def) == &val); } // Deferred single value via fancy iterator { auto it = cuda::counting_iterator{42}; - auto def = cuda::argument::deferred{it, cuda::argument::bounds<0, 100>()}; - assert(cuda::argument::__unwrap(def)[0] == 42); - static_assert(cuda::argument::__traits::lowest == 0); - static_assert(cuda::argument::__traits::highest == 100); - static_assert(cuda::argument::__traits::is_single_value); + auto def = cuda::args::deferred{it, cuda::args::bounds<0, 100>()}; + assert(cuda::args::__unwrap(def)[0] == 42); + static_assert(cuda::args::__traits::lowest == 0); + static_assert(cuda::args::__traits::highest == 100); + static_assert(cuda::args::__traits::is_single_value); } // Deferred single value with both bounds, runtime bounds first { - int val = 42; - auto def = cuda::argument::deferred{ - cuda::std::span{&val, 1}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 256>()}; - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 256); - assert(def.__runtime_bounds_.__lower_ == 5); - assert(def.__runtime_bounds_.__upper_ == 100); - assert(cuda::argument::__lowest_(def) == 5); - assert(cuda::argument::__highest_(def) == 100); - def.__runtime_bounds_.__upper_ = 90; - assert(cuda::argument::__highest_(def) == 90); + int val = 42; + auto def = + cuda::args::deferred{cuda::std::span{&val, 1}, cuda::args::bounds(5, 100), cuda::args::bounds<1, 256>()}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 256); + assert(cuda::args::__access::__runtime_bounds(def).lower() == 5); + assert(cuda::args::__access::__runtime_bounds(def).upper() == 100); + assert(cuda::args::__lowest_(def) == 5); + assert(cuda::args::__highest_(def) == 100); + cuda::args::__access::__runtime_bounds(def) = cuda::args::bounds(5, 90); + assert(cuda::args::__highest_(def) == 90); } // Deferred sequence via fancy iterator { auto it = cuda::counting_iterator{10}; - auto def = cuda::argument::deferred_sequence{it, cuda::argument::bounds<0, 100>()}; - assert(cuda::argument::__unwrap(def)[0] == 10); - assert(cuda::argument::__unwrap(def)[2] == 12); - static_assert(cuda::argument::__traits::lowest == 0); - static_assert(cuda::argument::__traits::highest == 100); - static_assert(!cuda::argument::__traits::is_single_value); + auto def = cuda::args::deferred_sequence{it, cuda::args::bounds<0, 100>()}; + assert(cuda::args::__unwrap(def)[0] == 10); + assert(cuda::args::__unwrap(def)[2] == 12); + static_assert(cuda::args::__traits::lowest == 0); + static_assert(cuda::args::__traits::highest == 100); + static_assert(!cuda::args::__traits::is_single_value); } // Deferred sequence with both bounds { int arr[4] = {10, 20, 30, 40}; - auto def = cuda::argument::deferred_sequence{ - cuda::std::span{arr, 4}, cuda::argument::bounds<1, 4096>(), cuda::argument::bounds(5, 100)}; - assert(def.__arg_.size() == 4); - assert(def.__runtime_bounds_.__lower_ == 5); - assert(def.__runtime_bounds_.__upper_ == 100); - static_assert(cuda::argument::__traits::lowest == 1); - assert(cuda::argument::__lowest_(def) == 5); - assert(cuda::argument::__highest_(def) == 100); + auto def = cuda::args::deferred_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds<1, 4096>(), cuda::args::bounds(5, 100)}; + assert(cuda::args::__access::__arg(def).size() == 4); + assert(cuda::args::__access::__runtime_bounds(def).lower() == 5); + assert(cuda::args::__access::__runtime_bounds(def).upper() == 100); + static_assert(cuda::args::__traits::lowest == 1); + assert(cuda::args::__lowest_(def) == 5); + assert(cuda::args::__highest_(def) == 100); } // Deferred sequence with both bounds, runtime bounds first { int arr[4] = {10, 20, 30, 40}; - auto def = cuda::argument::deferred_sequence{ - cuda::std::span{arr, 4}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 4096>()}; - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 4096); - assert(cuda::argument::__lowest_(def) == 5); - assert(cuda::argument::__highest_(def) == 100); + auto def = cuda::args::deferred_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds(5, 100), cuda::args::bounds<1, 4096>()}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 4096); + assert(cuda::args::__lowest_(def) == 5); + assert(cuda::args::__highest_(def) == 100); } // Traits: deferred is single value { - using traits = cuda::argument::__traits>>; + using traits = cuda::args::__traits>>; static_assert(traits::is_deferred); static_assert(traits::is_single_value); } // Traits: deferred with pointer is also single value { - using traits = cuda::argument::__traits>; + using traits = cuda::args::__traits>; static_assert(traits::is_deferred); static_assert(traits::is_single_value); } // Traits: deferred_sequence is not single value { - using traits = cuda::argument::__traits>>; + using traits = cuda::args::__traits>>; static_assert(traits::is_deferred); static_assert(!traits::is_single_value); } @@ -133,16 +133,16 @@ TEST_FUNC constexpr bool test() // Unwrap: deferred { int val = 99; - auto def = cuda::argument::deferred{cuda::std::span{&val, 1}}; - auto& v = cuda::argument::__unwrap(def); + auto def = cuda::args::deferred{cuda::std::span{&val, 1}}; + auto& v = cuda::args::__unwrap(def); assert(v[0] == 99); } // Unwrap: deferred_sequence { int arr[3] = {10, 20, 30}; - auto def = cuda::argument::deferred_sequence{cuda::std::span{arr, 3}}; - const auto& v = cuda::argument::__unwrap(def); + auto def = cuda::args::deferred_sequence{cuda::std::span{arr, 3}}; + const auto& v = cuda::args::__unwrap(def); assert(v.size() == 3); assert(v[1] == 20); } @@ -150,14 +150,14 @@ TEST_FUNC constexpr bool test() // Unwrap: rvalue deferred returns by value { int val = 99; - auto v = cuda::argument::__unwrap(cuda::argument::deferred{cuda::std::span{&val, 1}}); + auto v = cuda::args::__unwrap(cuda::args::deferred{cuda::std::span{&val, 1}}); assert(v[0] == 99); } // Unwrap: rvalue deferred_sequence returns by value { int arr[3] = {10, 20, 30}; - auto v = cuda::argument::__unwrap(cuda::argument::deferred_sequence{cuda::std::span{arr, 3}}); + auto v = cuda::args::__unwrap(cuda::args::deferred_sequence{cuda::std::span{arr, 3}}); assert(v.size() == 3); assert(v[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp index 1ce371b84e5..64bad620293 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp @@ -10,7 +10,7 @@ #include -[[maybe_unused]] cuda::argument::deferred_sequence invalid_arg{0}; +[[maybe_unused]] cuda::args::deferred_sequence invalid_arg{0}; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp index 41a9a2ae778..111bc226ae5 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp @@ -10,7 +10,7 @@ #include -using traits = cuda::argument::__traits>; +using traits = cuda::args::__traits>; [[maybe_unused]] constexpr bool invalid_traits = traits::is_deferred; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp index 23e867be616..ef27bdbec39 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp @@ -25,99 +25,99 @@ TEST_FUNC constexpr bool test() { // Uniform scalar via CTAD { - auto da = cuda::argument::immediate{5}; - assert(cuda::argument::__unwrap(da) == 5); - assert(da.__arg_ == 5); - static_assert(cuda::argument::__traits::lowest == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__traits::highest == (cuda::std::numeric_limits::max)()); - assert(cuda::argument::__lowest_(da) == 5); - assert(cuda::argument::__highest_(da) == 5); - da.__arg_ = 6; - assert(cuda::argument::__unwrap(da) == 6); + auto da = cuda::args::immediate{5}; + assert(cuda::args::__unwrap(da) == 5); + assert(cuda::args::__access::__arg(da) == 5); + static_assert(cuda::args::__traits::lowest == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__traits::highest == (cuda::std::numeric_limits::max)()); + assert(cuda::args::__lowest_(da) == 5); + assert(cuda::args::__highest_(da) == 5); + cuda::args::__access::__arg(da) = 6; + assert(cuda::args::__unwrap(da) == 6); } // Uniform scalar with static bounds { - auto da = cuda::argument::immediate{5, cuda::argument::bounds<1, 8>()}; - assert(cuda::argument::__unwrap(da) == 5); - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 8); - assert(cuda::argument::__lowest_(da) == 5); - assert(cuda::argument::__highest_(da) == 5); + auto da = cuda::args::immediate{5, cuda::args::bounds<1, 8>()}; + assert(cuda::args::__unwrap(da) == 5); + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 8); + assert(cuda::args::__lowest_(da) == 5); + assert(cuda::args::__highest_(da) == 5); } // Non-sequence values are accepted without scalar-only restrictions { - auto da = cuda::argument::immediate{non_sequence_value{7}}; - assert(cuda::argument::__unwrap(da).payload == 7); + auto da = cuda::args::immediate{non_sequence_value{7}}; + assert(cuda::args::__unwrap(da).payload == 7); } // Pointer-like types can still represent a single value when explicitly wrapped that way { int value = 11; - auto da = cuda::argument::immediate{&value}; - static_assert(cuda::argument::__traits::is_single_value); - assert(*cuda::argument::__unwrap(da) == 11); + auto da = cuda::args::immediate{&value}; + static_assert(cuda::args::__traits::is_single_value); + assert(*cuda::args::__unwrap(da) == 11); } // Per-segment span with runtime bounds { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}, cuda::argument::bounds(1L, 100L)}; - assert(cuda::argument::__unwrap(da).size() == 4); - assert(da.__arg_.size() == 4); - assert(da.__runtime_bounds_.__lower_ == 1); - assert(da.__runtime_bounds_.__upper_ == 100); - assert(cuda::argument::__lowest_(da) == 1); - assert(cuda::argument::__highest_(da) == 100); - da.__runtime_bounds_.__upper_ = 90; - assert(cuda::argument::__highest_(da) == 90); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 4}, cuda::args::bounds(1L, 100L)}; + assert(cuda::args::__unwrap(da).size() == 4); + assert(cuda::args::__access::__arg(da).size() == 4); + assert(cuda::args::__access::__runtime_bounds(da).lower() == 1); + assert(cuda::args::__access::__runtime_bounds(da).upper() == 100); + assert(cuda::args::__lowest_(da) == 1); + assert(cuda::args::__highest_(da) == 100); + cuda::args::__access::__runtime_bounds(da) = cuda::args::bounds(1, 90); + assert(cuda::args::__highest_(da) == 90); } // Per-segment span with both bounds { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::argument::immediate_sequence{ - cuda::std::span{arr, 4}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(10, 200)}; - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 256); - assert(cuda::argument::__lowest_(da) == 10); - assert(cuda::argument::__highest_(da) == 200); + auto da = cuda::args::__immediate_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds<1, 256>(), cuda::args::bounds(10, 200)}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 256); + assert(cuda::args::__lowest_(da) == 10); + assert(cuda::args::__highest_(da) == 200); } // Per-segment span with both bounds, runtime bounds first { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::argument::immediate_sequence{ - cuda::std::span{arr, 4}, cuda::argument::bounds(10, 200), cuda::argument::bounds<1, 256>()}; - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 256); - assert(cuda::argument::__lowest_(da) == 10); - assert(cuda::argument::__highest_(da) == 200); + auto da = cuda::args::__immediate_sequence{ + cuda::std::span{arr, 4}, cuda::args::bounds(10, 200), cuda::args::bounds<1, 256>()}; + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 256); + assert(cuda::args::__lowest_(da) == 10); + assert(cuda::args::__highest_(da) == 200); } // Per-segment via span { int arr[4] = {1, 2, 3, 4}; - auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}}; - assert(cuda::argument::__unwrap(da).size() == 4); - assert(cuda::argument::__unwrap(da)[0] == 1); - assert(cuda::argument::__unwrap(da)[3] == 4); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 4}}; + assert(cuda::args::__unwrap(da).size() == 4); + assert(cuda::args::__unwrap(da)[0] == 1); + assert(cuda::args::__unwrap(da)[3] == 4); } // Per-segment with static bounds { int arr[4] = {10, 20, 30, 40}; - auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 4}, cuda::argument::bounds<1, 100>()}; - assert(cuda::argument::__unwrap(da).size() == 4); - assert(cuda::argument::__unwrap(da)[2] == 30); - static_assert(cuda::argument::__traits::lowest == 1); - static_assert(cuda::argument::__traits::highest == 100); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 4}, cuda::args::bounds<1, 100>()}; + assert(cuda::args::__unwrap(da).size() == 4); + assert(cuda::args::__unwrap(da)[2] == 30); + static_assert(cuda::args::__traits::lowest == 1); + static_assert(cuda::args::__traits::highest == 100); } // Traits { - using traits = cuda::argument::__traits>; + using traits = cuda::args::__traits>; static_assert(!traits::is_deferred); static_assert(traits::is_single_value); static_assert(cuda::std::is_same_v); @@ -125,7 +125,7 @@ TEST_FUNC constexpr bool test() // Sequence traits { - using traits = cuda::argument::__traits>>; + using traits = cuda::args::__traits>>; static_assert(!traits::is_deferred); static_assert(!traits::is_single_value); static_assert(cuda::std::is_same_v>); @@ -133,39 +133,38 @@ TEST_FUNC constexpr bool test() // __is_single_value_v on unwrapped types { - static_assert( - cuda::argument::__is_single_value_v>::value_type>); - static_assert(!cuda::argument::__traits>>::is_single_value); + static_assert(cuda::args::__is_single_value_v>::value_type>); + static_assert(!cuda::args::__traits>>::is_single_value); } // Unwrap: scalar { - auto da = cuda::argument::immediate{7}; - auto& v = cuda::argument::__unwrap(da); + auto da = cuda::args::immediate{7}; + auto& v = cuda::args::__unwrap(da); assert(v == 7); v = 8; - assert(cuda::argument::__unwrap(da) == 8); + assert(cuda::args::__unwrap(da) == 8); } // Unwrap: span { int arr[3] = {10, 20, 30}; - auto da = cuda::argument::immediate_sequence{cuda::std::span{arr, 3}}; - const auto& v = cuda::argument::__unwrap(da); + auto da = cuda::args::__immediate_sequence{cuda::std::span{arr, 3}}; + const auto& v = cuda::args::__unwrap(da); assert(v.size() == 3); assert(v[1] == 20); } // Unwrap: rvalue scalar returns by value { - const auto& v = cuda::argument::__unwrap(cuda::argument::immediate{7}); + const auto& v = cuda::args::__unwrap(cuda::args::immediate{7}); assert(v == 7); } // Unwrap: rvalue span returns by value { int arr[3] = {10, 20, 30}; - auto v = cuda::argument::__unwrap(cuda::argument::immediate_sequence{cuda::std::span{arr, 3}}); + auto v = cuda::args::__unwrap(cuda::args::__immediate_sequence{cuda::std::span{arr, 3}}); assert(v.size() == 3); assert(v[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp index 485a17fe72b..df463b0136d 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp @@ -24,72 +24,76 @@ TEST_FUNC void test() { // Basic value { - constexpr auto sa = cuda::argument::constant<42>{}; - static_assert(sa.value() == 42); + constexpr auto sa = cuda::args::constant<42>{}; + static_assert(cuda::args::__unwrap(sa) == 42); static_assert(cuda::std::is_same_v); } // Different types { - constexpr auto sa_long = cuda::argument::constant<100L>{}; - static_assert(sa_long.value() == 100L); + constexpr auto sa_long = cuda::args::constant<100L>{}; + static_assert(cuda::args::__unwrap(sa_long) == 100L); static_assert(cuda::std::is_same_v); + + constexpr auto sa_float = cuda::args::constant<10, float>{}; + static_assert(cuda::args::__unwrap(sa_float) == 10.0f); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); } // Negative value { - constexpr auto sa_neg = cuda::argument::constant<-1>{}; - static_assert(sa_neg.value() == -1); + constexpr auto sa_neg = cuda::args::constant<-1>{}; + static_assert(cuda::args::__unwrap(sa_neg) == -1); } #if TEST_HAS_CLASS_NTTP // Non-sequence values are accepted without scalar-only restrictions { - constexpr auto sa = cuda::argument::constant{}; - static_assert(sa.value().payload == 7); - static_assert(cuda::argument::__unwrap(sa).payload == 7); + constexpr auto sa = cuda::args::constant{}; + static_assert(cuda::args::__unwrap(sa).payload == 7); } #endif // TEST_HAS_CLASS_NTTP #if TEST_HAS_CLASS_NTTP // Array sequence { - constexpr auto sa_arr = cuda::argument::constant_sequence{128, 256, 512}>{}; - static_assert(sa_arr.value()[0] == 128); - static_assert(sa_arr.value()[1] == 256); - static_assert(sa_arr.value()[2] == 512); + constexpr auto sa_arr = cuda::args::__constant_sequence{128, 256, 512}>{}; + static_assert(cuda::args::__unwrap(sa_arr)[0] == 128); + static_assert(cuda::args::__unwrap(sa_arr)[1] == 256); + static_assert(cuda::args::__unwrap(sa_arr)[2] == 512); static_assert(cuda::std::is_same_v>); } #endif // TEST_HAS_CLASS_NTTP // Bounds: scalar { - constexpr auto sa = cuda::argument::constant<42>{}; - static_assert(cuda::argument::__lowest_(sa) == 42); - static_assert(cuda::argument::__highest_(sa) == 42); + constexpr auto sa = cuda::args::constant<42>{}; + static_assert(cuda::args::__lowest_(sa) == 42); + static_assert(cuda::args::__highest_(sa) == 42); } #if TEST_HAS_CLASS_NTTP // Bounds: array sequence computes lowest/highest of elements { - constexpr auto sa = cuda::argument::constant_sequence{128, 256, 512}>{}; - static_assert(cuda::argument::__lowest_(sa) == 128); - static_assert(cuda::argument::__highest_(sa) == 512); + constexpr auto sa = cuda::args::__constant_sequence{128, 256, 512}>{}; + static_assert(cuda::args::__lowest_(sa) == 128); + static_assert(cuda::args::__highest_(sa) == 512); } #endif // TEST_HAS_CLASS_NTTP #if TEST_HAS_CLASS_NTTP // Bounds: empty array sequence has unconstrained element bounds { - constexpr auto sa = cuda::argument::constant_sequence{}>{}; - static_assert(cuda::argument::__lowest_(sa) == cuda::std::numeric_limits::lowest()); - static_assert(cuda::argument::__highest_(sa) == (cuda::std::numeric_limits::max)()); + constexpr auto sa = cuda::args::__constant_sequence{}>{}; + static_assert(cuda::args::__lowest_(sa) == cuda::std::numeric_limits::lowest()); + static_assert(cuda::args::__highest_(sa) == (cuda::std::numeric_limits::max)()); } #endif // TEST_HAS_CLASS_NTTP // Traits { - using traits = cuda::argument::__traits>; + using traits = cuda::args::__traits>; static_assert(!traits::is_deferred); static_assert(traits::is_constant); static_assert(traits::is_single_value); @@ -98,10 +102,22 @@ TEST_FUNC void test() static_assert(traits::highest == 42); } + // Traits: explicit constant value type + { + using traits = cuda::args::__traits>; + static_assert(!traits::is_deferred); + static_assert(traits::is_constant); + static_assert(traits::is_single_value); + static_assert(cuda::std::is_same_v); + static_assert(cuda::std::is_same_v); + static_assert(traits::lowest == 10.0f); + static_assert(traits::highest == 10.0f); + } + #if TEST_HAS_CLASS_NTTP // Sequence traits { - using traits = cuda::argument::__traits{1, 2, 3}>>; + using traits = cuda::args::__traits{1, 2, 3}>>; static_assert(traits::is_constant); static_assert(!traits::is_deferred); static_assert(!traits::is_single_value); @@ -112,26 +128,33 @@ TEST_FUNC void test() // Single value: scalar is single, sequence is not { - static_assert( - cuda::argument::__is_single_value_v>::value_type>); + static_assert(cuda::args::__is_single_value_v>::value_type>); #if TEST_HAS_CLASS_NTTP static_assert( - !cuda::argument::__traits{1, 2, 3}>>::is_single_value); + !cuda::args::__traits{1, 2, 3}>>::is_single_value); #endif // TEST_HAS_CLASS_NTTP } // Unwrap: scalar { - constexpr auto sa = cuda::argument::constant<42>{}; - constexpr auto val = cuda::argument::__unwrap(sa); + constexpr auto sa = cuda::args::constant<42>{}; + constexpr auto val = cuda::args::__unwrap(sa); static_assert(val == 42); } + // Unwrap: scalar with explicit value type + { + constexpr auto sa = cuda::args::constant<10, float>{}; + constexpr auto val = cuda::args::__unwrap(sa); + static_assert(val == 10.0f); + static_assert(cuda::std::is_same_v); + } + #if TEST_HAS_CLASS_NTTP // Unwrap: sequence { - constexpr auto sa = cuda::argument::constant_sequence{10, 20, 30}>{}; - constexpr auto val = cuda::argument::__unwrap(sa); + constexpr auto sa = cuda::args::__constant_sequence{10, 20, 30}>{}; + constexpr auto val = cuda::args::__unwrap(sa); static_assert(val[0] == 10); static_assert(val[2] == 30); } diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp index 66a1828769b..b59d41fd7a1 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp @@ -10,7 +10,7 @@ #include -using arg_t = cuda::argument::immediate>; +using arg_t = cuda::args::immediate>; [[maybe_unused]] arg_t invalid_arg{0}; diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp index 7f9902e50a2..5212c8a1f9c 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp @@ -10,9 +10,9 @@ #include -using arg_t = cuda::argument::immediate>; +using arg_t = cuda::args::immediate>; -[[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits::highest; +[[maybe_unused]] constexpr auto invalid_highest = cuda::args::__traits::highest; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp index 8170fdd7ee5..02ba7ecfe96 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp @@ -10,7 +10,7 @@ #include -[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::static_bounds<0, 1L>{}; +[[maybe_unused]] constexpr auto invalid_bounds = cuda::args::static_bounds<0, 1L>{}; int main(int, char**) { diff --git a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp index f449975cb06..33b193edc8e 100644 --- a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp +++ b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp @@ -10,7 +10,7 @@ // Integration test: demonstrates how an algorithm consumes argument wrappers // to make compile-time and runtime resource decisions. -// All argument types (plain values, static, dynamic, deferred) work uniformly +// All argument types (plain values, constants, immediate values, deferred values) work uniformly // through the free functions. #include @@ -34,7 +34,7 @@ enum class algorithm_variant template TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg) { - if constexpr (cuda::argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity) + if constexpr (cuda::args::__traits<_SegSizeArg>::highest <= shared_memory_capacity) { return algorithm_variant::shared_memory; } @@ -48,7 +48,7 @@ TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg) template TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_segments) { - auto __highest = cuda::std::min(default_max_segment_size, static_cast(cuda::argument::__highest_(__seg_size))); + auto __highest = cuda::std::min(default_max_segment_size, static_cast(cuda::args::__highest_(__seg_size))); return __highest * __num_segments; } @@ -56,9 +56,9 @@ TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_se template TEST_FUNC constexpr int process_segments(_SegSizeArg __seg_size) { - const auto& __val = cuda::argument::__unwrap(__seg_size); + const auto& __val = cuda::args::__unwrap(__seg_size); - if constexpr (cuda::argument::__traits<_SegSizeArg>::is_single_value) + if constexpr (cuda::args::__traits<_SegSizeArg>::is_single_value) { return static_cast(__val); } @@ -91,82 +91,82 @@ TEST_FUNC constexpr bool test() assert(process_segments(seg) == 64 + 128 + 96); } - // static_argument: scalar, fits in shared memory, buffer = value + // constant: scalar, fits in shared memory, buffer = value { - constexpr auto seg_size = cuda::argument::constant<128>{}; + constexpr auto seg_size = cuda::args::constant<128>{}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 128 * 4); assert(process_segments(seg_size) == 128); } #if TEST_HAS_CLASS_NTTP - // static_argument: array sequence, highest fits in shared memory + // __constant_sequence: array sequence, highest fits in shared memory { - constexpr auto seg_sizes = cuda::argument::constant_sequence{}; + constexpr auto seg_sizes = cuda::args::__constant_sequence{}; static_assert(select_variant(seg_sizes) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_sizes, 3) == 256 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 256); } - // static_argument: array sequence, highest exceeds shared memory, buffer clamped + // __constant_sequence: array sequence, highest exceeds shared memory, buffer clamped { - constexpr auto seg_sizes = cuda::argument::constant_sequence{}; + constexpr auto seg_sizes = cuda::args::__constant_sequence{}; static_assert(select_variant(seg_sizes) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_sizes, 3) == 512 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 512); } #endif // TEST_HAS_CLASS_NTTP - // dynamic_argument: tight static bounds, shared memory, buffer = value + // immediate: tight static bounds, shared memory, buffer = value { - constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 256>()}; + constexpr auto seg_size = cuda::args::immediate{100, cuda::args::bounds<1, 256>()}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); } - // dynamic_argument: wide static bounds, global memory, buffer = value + // immediate: wide static bounds, global memory, buffer = value { - constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 4096>()}; + constexpr auto seg_size = cuda::args::immediate{100, cuda::args::bounds<1, 4096>()}; static_assert(select_variant(seg_size) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); } - // dynamic_argument: no bounds, global memory, buffer = value + // immediate: no bounds, global memory, buffer = value { - constexpr auto seg_size = cuda::argument::immediate{100}; + constexpr auto seg_size = cuda::args::immediate{100}; static_assert(select_variant(seg_size) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_size, 4) == 100 * 4); assert(process_segments(seg_size) == 100); } - // dynamic_argument: per-segment span with runtime bounds only + // __immediate_sequence: per-segment span with runtime bounds only { int sizes[3] = {64, 128, 96}; - auto seg_sizes = cuda::argument::immediate_sequence{cuda::std::span{sizes, 3}, cuda::argument::bounds(1, 200)}; + auto seg_sizes = cuda::args::__immediate_sequence{cuda::std::span{sizes, 3}, cuda::args::bounds(1, 200)}; assert(select_variant(seg_sizes) == algorithm_variant::global_memory); assert(compute_buffer_size(seg_sizes, 3) == 200 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 96); } - // dynamic_argument: per-segment span with both bounds + // __immediate_sequence: per-segment span with both bounds { int sizes[3] = {64, 128, 96}; - auto seg_sizes = cuda::argument::immediate_sequence{ - cuda::std::span{sizes, 3}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)}; - static_assert(cuda::argument::__traits::highest <= shared_memory_capacity); + auto seg_sizes = cuda::args::__immediate_sequence{ + cuda::std::span{sizes, 3}, cuda::args::bounds<1, 256>(), cuda::args::bounds(1, 200)}; + static_assert(cuda::args::__traits::highest <= shared_memory_capacity); assert(select_variant(seg_sizes) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_sizes, 3) == 200 * 3); assert(process_segments(seg_sizes) == 64 + 128 + 96); } - // deferred_argument: uniform, bounds for decisions only + // deferred: uniform, bounds for decisions only { - int val = 100; - auto seg_size = cuda::argument::deferred{ - cuda::std::span{&val, 1}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)}; - static_assert(cuda::argument::__traits::highest <= shared_memory_capacity); + int val = 100; + auto seg_size = + cuda::args::deferred{cuda::std::span{&val, 1}, cuda::args::bounds<1, 256>(), cuda::args::bounds(1, 200)}; + static_assert(cuda::args::__traits::highest <= shared_memory_capacity); assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(compute_buffer_size(seg_size, 4) == 200 * 4); } @@ -179,17 +179,24 @@ TEST_FUNC constexpr bool test() assert(process_segments(1.0f) == 1); } + // constant float using an integer NTTP and explicit value type + { + constexpr auto seg_size = cuda::args::constant<128, float>{}; + static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); + assert(process_segments(seg_size) == 128); + } + #if TEST_HAS_CLASS_NTTP - // static_argument float (float NTTPs require C++20) + // constant float (float NTTPs require C++20) { - constexpr auto seg_size = cuda::argument::constant<128.0f>{}; + constexpr auto seg_size = cuda::args::constant<128.0f>{}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(process_segments(seg_size) == 128); } - // dynamic_argument float with static bounds + // immediate float with static bounds { - constexpr auto seg_size = cuda::argument::immediate{100.0f, cuda::argument::bounds<1.0f, 256.0f>()}; + constexpr auto seg_size = cuda::args::immediate{100.0f, cuda::args::bounds<1.0f, 256.0f>()}; static_assert(select_variant(seg_size) == algorithm_variant::shared_memory); assert(process_segments(seg_size) == 100); }