From 06c277f7a6fe8cdc30ae0442fd570ec53a45437e Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Wed, 3 Jun 2026 22:57:33 -0700
Subject: [PATCH 1/5] Make argument wrappers construction public

---
 .../bench/segmented_topk/fixed/keys.cu        |  12 +-
 .../bench/segmented_topk/variable/keys.cu     |  14 +-
 cub/cub/agent/agent_batched_topk.cuh          |  12 +-
 cub/cub/detail/segmented_params.cuh           |  30 ++--
 .../device/dispatch/dispatch_batched_topk.cuh |  20 +--
 .../dispatch/kernels/kernel_batched_topk.cuh  |  10 +-
 .../catch2_test_device_segmented_topk_keys.cu |  27 ++-
 ...catch2_test_device_segmented_topk_pairs.cu |  18 +-
 libcudacxx/include/cuda/__argument/argument.h | 160 ++++++++---------
 .../include/cuda/__argument/argument_bounds.h |   4 +-
 libcudacxx/include/cuda/argument              |  26 +++
 .../include/cuda/std/__internal/namespaces.h  |   4 +-
 .../cuda/argument/argument_bounds.pass.cpp    |  42 +++--
 .../cuda/argument/argument_traits.pass.cpp    | 168 +++++++++---------
 .../cuda/argument/deferred_argument.pass.cpp  | 110 ++++++------
 .../cuda/argument/dynamic_argument.pass.cpp   | 119 +++++++------
 .../cuda/argument/static_argument.pass.cpp    |  50 +++---
 .../static_bounds_conversion.fail.cpp         |   6 +-
 .../static_bounds_type_mismatch.fail.cpp      |   4 +-
 .../cuda/argument/usage_example.pass.cpp      |  43 +++--
 20 files changed, 457 insertions(+), 422 deletions(-)
 create mode 100644 libcudacxx/include/cuda/argument
diff --git a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
index b8f13469dce..4178e7ea0d5 100644
--- a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
+++ b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
@@ -4,7 +4,7 @@
 #include <cub/detail/choose_offset.cuh>
 #include <cub/device/dispatch/dispatch_batched_topk.cuh>
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 
 #include <nvbench_helper.cuh>
@@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys(
   const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected);
   const auto num_segments      = ::cuda::std::max<std::size_t>(1, (max_elements / segment_size));
   const auto elements          = num_segments * segment_size;
-  const auto total_num_items   = ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(elements)};
+  const auto total_num_items   = ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(elements)};
   const bit_entropy entropy    = str_to_entropy(state.get_string("Entropy"));
 
   // Skip workloads where k exceeds the segment size
@@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys(
   auto d_keys_in      = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size);
   auto d_keys_out     = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements);
 
-  auto segment_sizes    = ::cuda::__argument::__constant<MaxSegmentSize>{};
-  auto k                = ::cuda::__argument::__constant<MaxNumSelected>{};
-  auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
+  auto segment_sizes    = ::cuda::argument::constant<MaxSegmentSize>{};
+  auto k                = ::cuda::argument::constant<MaxNumSelected>{};
+  auto select_direction = ::cuda::argument::constant<cub::detail::topk::select::max>{};
 
   state.add_element_count(elements, "NumElements");
   state.add_element_count(segment_size, "SegmentSize");
@@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys(
       segment_sizes,
       k,
       select_direction,
-      ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(num_segments)},
+      ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(num_segments)},
       total_num_items,
       env);
   });
diff --git a/cub/benchmarks/bench/segmented_topk/variable/keys.cu b/cub/benchmarks/bench/segmented_topk/variable/keys.cu
index 3db3da44976..d14394afc39 100644
--- a/cub/benchmarks/bench/segmented_topk/variable/keys.cu
+++ b/cub/benchmarks/bench/segmented_topk/variable/keys.cu
@@ -8,7 +8,7 @@
 #include <thrust/reduce.h>
 #include <thrust/tabulate.h>
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 #include <cuda/random>
 #include <cuda/std/algorithm>
@@ -172,17 +172,17 @@ void variable_seg_size_topk_keys(nvbench::state& state,
     static_cast<cuda::std::int64_t>(MaxSegmentSize));
   const auto input_elements  = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
   const auto output_elements = static_cast<std::size_t>(num_segments) * K;
-  const auto total_num_items = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(input_elements)};
+  const auto total_num_items = ::cuda::argument::immediate{static_cast<cuda::std::int64_t>(input_elements)};
 
   auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
     num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
   auto out_keys_buffer = thrust::device_vector<KeyT>(output_elements, thrust::no_init);
 
-  auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{
-    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()};
-  auto k_param            = ::cuda::__argument::__constant<K>{};
-  auto select_direction   = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
-  auto num_segments_param = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(num_segments)};
+  auto segment_sizes_param = ::cuda::argument::immediate_sequence{
+    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::argument::bounds<1, MaxSegmentSize>()};
+  auto k_param            = ::cuda::argument::constant<K>{};
+  auto select_direction   = ::cuda::argument::constant<cub::detail::topk::select::max>{};
+  auto num_segments_param = ::cuda::argument::immediate{static_cast<cuda::std::int64_t>(num_segments)};
 
   auto d_keys_in = cuda::make_strided_iterator(
     cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),
diff --git a/cub/cub/agent/agent_batched_topk.cuh b/cub/cub/agent/agent_batched_topk.cuh
index c5ec20d26cd..bc10311d36d 100644
--- a/cub/cub/agent/agent_batched_topk.cuh
+++ b/cub/cub/agent/agent_batched_topk.cuh
@@ -23,8 +23,8 @@
 #include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
 #include <cub/util_type.cuh>
 
-#include <cuda/__argument_>
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/argument>
 
 CUB_NAMESPACE_BEGIN
 
@@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment
   using key_t   = it_value_t<key_it_t>;
   using value_t = it_value_t<value_it_t>;
 
-  using segment_size_val_t = typename ::cuda::__argument::__traits<SegmentSizeParameterT>::element_type;
-  using num_segments_val_t = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
+  using segment_size_val_t = typename ::cuda::argument::__traits<SegmentSizeParameterT>::element_type;
+  using num_segments_val_t = typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type;
   using counters_t         = batched_topk_counters<num_segments_val_t>;
 
   static constexpr auto policy                 = PolicyGetter{}();
@@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment
     multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread;
 
   // Check if there could be large segments present
-  static constexpr bool only_small_segments = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
+  static constexpr bool only_small_segments = ::cuda::argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
 
   // Check if we are dealing with keys-only or key-value pairs
   static constexpr bool is_keys_only = ::cuda::std::is_same_v<value_t, cub::NullType>;
@@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment
       return;
     }
 
-    constexpr bool is_full_tile = ::cuda::__argument::__traits<SegmentSizeParameterT>::is_constant
-                               && ::cuda::__argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
+    constexpr bool is_full_tile = ::cuda::argument::__traits<SegmentSizeParameterT>::is_constant
+                               && ::cuda::argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
 
     // Resolve Segment Parameters
     const auto segment_size = params::get_param(segment_sizes, segment_id);
diff --git a/cub/cub/detail/segmented_params.cuh b/cub/cub/detail/segmented_params.cuh
index fe5cc5c9162..e9a999a87b1 100644
--- a/cub/cub/detail/segmented_params.cuh
+++ b/cub/cub/detail/segmented_params.cuh
@@ -13,7 +13,7 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/remove_cvref.h>
 #include <cuda/std/__utility/forward.h>
@@ -33,10 +33,10 @@ namespace detail::params
 //! @param[in] __index Segment index to read for sequence arguments.
 //! @return The single argument value, or the sequence element at the given index.
 _CCCL_TEMPLATE(class _Tp, class _SegmentIndexT)
-_CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
+_CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  if constexpr (::cuda::__argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
+  if constexpr (::cuda::argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
   {
     return __arg;
   }
@@ -48,44 +48,44 @@ _CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<
 
 template <auto _Value, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+get_param(const ::cuda::argument::constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg);
+  return ::cuda::argument::__unwrap(__arg);
 }
 
 template <auto _Value, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::argument::constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg)[__index];
+  return ::cuda::argument::__unwrap(__arg)[__index];
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
-  const ::cuda::__argument::__immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+  const ::cuda::argument::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg);
+  return ::cuda::argument::__unwrap(__arg);
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::argument::immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg)[__index];
+  return ::cuda::argument::__unwrap(__arg)[__index];
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
-  const ::cuda::__argument::__deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+  const ::cuda::argument::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg);
+  return ::cuda::argument::__unwrap(__arg);
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::argument::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg)[__index];
+  return ::cuda::argument::__unwrap(__arg)[__index];
 }
 
 // =====================================================================
diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
index d0f2d4eed0a..68a3a4f26b0 100644
--- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
@@ -31,10 +31,10 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
-#include <cuda/__argument_>
 #include <cuda/__cmath/ceil_div.h>
 #include <cuda/__iterator/counting_iterator.h>
 #include <cuda/__iterator/transform_iterator.h>
+#include <cuda/argument>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/__type_traits/remove_cv.h>
@@ -49,9 +49,9 @@ namespace detail::batched_topk
 // Internal: wrap user-facing select direction into discrete param for dispatch
 // -----------------------------------------------------------------------------
 
-// Uniform (compile-time): __constant<Dir> -> single-option uniform_discrete_param.
+// Uniform (compile-time): constant<Dir> -> single-option uniform_discrete_param.
 template <detail::topk::select Dir>
-[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::__argument::__constant<Dir>)
+[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::argument::constant<Dir>)
 {
   return params::uniform_discrete_param<detail::topk::select, Dir>{Dir};
 }
@@ -126,7 +126,7 @@ template <typename KeyInputItItT,
           typename PolicySelector = policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
                                                                it_value_t<it_value_t<ValueInputItItT>>,
                                                                ::cuda::std::int64_t,
-                                                               ::cuda::__argument::__traits<KParameterT>::highest>>
+                                                               ::cuda::argument::__traits<KParameterT>::highest>>
 #if _CCCL_HAS_CONCEPTS()
   requires batched_topk_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
@@ -145,7 +145,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream                             = nullptr,
   [[maybe_unused]] PolicySelector policy_selector = {})
 {
-  using large_segment_tile_offset_t = typename ::cuda::__argument::__traits<TotalNumItemsGuaranteeT>::element_type;
+  using large_segment_tile_offset_t = typename ::cuda::argument::__traits<TotalNumItemsGuaranteeT>::element_type;
 
   // Wrap the raw enum into the internal discrete param type
   auto select_directions          = wrap_select_direction(select_direction);
@@ -171,9 +171,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   static constexpr int worker_per_segment_tile_size =
     worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread;
   static constexpr bool any_small_segments =
-    ::cuda::__argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
+    ::cuda::argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
   static constexpr bool only_small_segments =
-    ::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
+    ::cuda::argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
 
   // Allocation layout:
   //   only_small_segments: [0] dummy.
@@ -183,7 +183,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   static constexpr int allocations_array_size     = only_small_segments ? 1 : (any_small_segments ? 3 : 2);
   size_t allocation_sizes[allocations_array_size] = {1};
 
-  using num_segments_val_t         = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
+  using num_segments_val_t         = typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type;
   using counters_t                 = batched_topk_counters<num_segments_val_t>;
   using segment_size_scan_offset_t = detail::choose_offset_t<num_segments_val_t>;
   using segment_size_scan_input_op_t =
@@ -239,7 +239,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
 
   // TODO (elstehle): support number of segments provided by device-accessible iterator
   // Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host)
-  static_assert(::cuda::__argument::__traits<NumSegmentsParameterT>::is_single_value,
+  static_assert(::cuda::argument::__traits<NumSegmentsParameterT>::is_single_value,
                 "Only uniform segment sizes are currently supported.");
 
   if constexpr (any_small_segments)
@@ -341,7 +341,7 @@ template <typename KeyInputItItT,
     policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
                                it_value_t<it_value_t<ValueInputItItT>>,
                                ::cuda::std::int64_t,
-                               ::cuda::__argument::__traits<KParameterT>::highest>;
+                               ::cuda::argument::__traits<KParameterT>::highest>;
   return detail::dispatch_with_env_and_tuning<default_policy_selector>(
     env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
       return dispatch(
diff --git a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
index 1ff50dfaf67..9c904f95971 100644
--- a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
@@ -20,8 +20,8 @@
 #include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
 #include <cub/util_arch.cuh>
 
-#include <cuda/__argument_>
 #include <cuda/__device/compute_capability.h>
+#include <cuda/argument>
 
 CUB_NAMESPACE_BEGIN
 
@@ -39,7 +39,7 @@ private:
     worker_policy worker_per_segment_policy;
     multi_worker_policy multi_worker_per_segment_policy;
   };
-  static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest;
+  static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::argument::__traits<SegmentSizeParameterT>::highest;
   static constexpr batched_topk_policy active_policy     = current_policy<PolicySelector>();
 
   template <int Index>
@@ -133,8 +133,8 @@ __launch_bounds__(int(
     KParameterT k,
     SelectDirectionParameterT select_directions,
     NumSegmentsParameterT num_segments,
-    batched_topk_counters<typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type>* d_counters,
-    typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
+    batched_topk_counters<typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type>* d_counters,
+    typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
     LargeSegmentTileOffsetT* d_large_segments_tile_offsets)
 {
   using agent_t = typename find_smallest_covering_policy<
@@ -151,7 +151,7 @@ __launch_bounds__(int(
     LargeSegmentTileOffsetT>::agent_t;
 
   // Static Assertions (Constraints)
-  static_assert(agent_t::tile_size >= ::cuda::__argument::__traits<SegmentSizeParameterT>::highest,
+  static_assert(agent_t::tile_size >= ::cuda::argument::__traits<SegmentSizeParameterT>::highest,
                 "Block size exceeds maximum segment size supported by SegmentSizeParameterT");
   static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block,
                 "Static shared memory per block must not exceed 48KB limit.");
diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu
index 3d00c1119cc..e16d5637f19 100644
--- a/cub/test/catch2_test_device_segmented_topk_keys.cu
+++ b/cub/test/catch2_test_device_segmented_topk_keys.cu
@@ -151,11 +151,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
   batched_topk_keys(
     d_keys_in,
     d_keys_out,
-    ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds<segment_size_t{1}, max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<segment_size_t{1}, max_segment_size>()},
+    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
     direction,
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_segments * segment_size});
+    ::cuda::argument::immediate{num_segments},
+    ::cuda::argument::immediate{num_segments * segment_size});
   // Prepare expected results
   fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction);
   compact_sorted_keys_to_topk(expected_keys, segment_size, k);
@@ -248,12 +248,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment
   batched_topk_keys(
     d_keys_in,
     d_keys_out,
-    ::cuda::__argument::__immediate_sequence{
-      segment_size_it, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::argument::immediate_sequence{
+      segment_size_it, ::cuda::argument::bounds<segment_size_t{1}, static_max_segment_size>()},
+    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
     direction,
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_items});
+    ::cuda::argument::immediate{num_segments},
+    ::cuda::argument::immediate{num_items});
 
   // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k
   segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction);
@@ -286,12 +286,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment
   batched_topk_keys(
     d_keys_in_it,
     d_keys_out_it,
-    ::cuda::__argument::__immediate{
-      segment_size, ::cuda::__argument::__bounds<cuda::std::int64_t{1}, max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<cuda::std::int64_t{1}, k>()},
+    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<cuda::std::int64_t{1}, max_segment_size>()},
+    ::cuda::argument::immediate{k, ::cuda::argument::bounds<cuda::std::int64_t{1}, k>()},
     cub::detail::topk::select::min,
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_segments * segment_size});
+    ::cuda::argument::immediate{num_segments},
+    ::cuda::argument::immediate{num_segments * segment_size});
 
   const int num_minus_zero = static_cast<int>(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{}));
   REQUIRE(num_minus_zero >= 1);
diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu
index cc34ceba3c6..eb2825f15d2 100644
--- a/cub/test/catch2_test_device_segmented_topk_pairs.cu
+++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu
@@ -220,11 +220,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments"
     d_keys_out,
     d_values_in,
     d_values_out,
-    ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds<segment_size_t{1}, max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<segment_size_t{1}, max_segment_size>()},
+    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
     direction,
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_segments * segment_size});
+    ::cuda::argument::immediate{num_segments},
+    ::cuda::argument::immediate{num_segments * segment_size});
 
   // Verification:
   // - We verify correct top-k selection through the keys
@@ -340,12 +340,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen
     d_keys_out,
     d_values_in,
     d_values_out,
-    ::cuda::__argument::__immediate_sequence{
-      segment_size_it, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::argument::immediate_sequence{
+      segment_size_it, ::cuda::argument::bounds<segment_size_t{1}, static_max_segment_size>()},
+    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
     direction,
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_items});
+    ::cuda::argument::immediate{num_segments},
+    ::cuda::argument::immediate{num_items});
 
   // Verification:
   // - We verify correct top-k selection through the keys
diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
index fb514d22648..42d66233c6e 100644
--- a/libcudacxx/include/cuda/__argument/argument.h
+++ b/libcudacxx/include/cuda/__argument/argument.h
@@ -94,7 +94,7 @@ inline constexpr bool __is_iterable_v<_Tp,
                                                           decltype(::cuda::std::declval<const _Tp&>().end())>> = true;
 
 // =====================================================================
-// __constant
+// constant
 // =====================================================================
 
 // Non-sequence wrappers intentionally do not reject types with a distinct element type.
@@ -103,7 +103,7 @@ inline constexpr bool __is_iterable_v<_Tp,
 
 //! @brief Wraps a compile-time constant argument value.
 template <auto _Value>
-struct __constant
+struct constant
 {
   using value_type     = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using __element_type = value_type;
@@ -116,7 +116,7 @@ struct __constant
 
 //! @brief Wraps a compile-time constant argument sequence.
 template <auto _Value>
-struct __constant_sequence
+struct constant_sequence
 {
   using value_type     = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using __element_type = __element_type_of_t<value_type>;
@@ -254,14 +254,14 @@ _CCCL_API constexpr void __validate_runtime_element_bounds(
 }
 
 // =====================================================================
-// __immediate
+// immediate
 // =====================================================================
 
 //! @brief Wraps a runtime argument value with optional bounds.
 //!
 //! The value is host-accessible at API call time.
 template <class _Arg, class _StaticBounds = __no_bounds>
-struct __immediate
+struct immediate
 {
   using __element_type = __element_type_of_t<_Arg>;
 
@@ -281,13 +281,13 @@ struct __immediate
   }
 
 public:
-  _CCCL_API constexpr __immediate(_Arg __arg) noexcept
+  _CCCL_API constexpr immediate(_Arg __arg) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_value();
   }
 
-  _CCCL_API constexpr __immediate(_Arg __arg, _StaticBounds) noexcept
+  _CCCL_API constexpr immediate(_Arg __arg, _StaticBounds) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_value();
@@ -296,18 +296,18 @@ struct __immediate
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __immediate(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> __immediate<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>)
+  -> immediate<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
-// __immediate_sequence
+// immediate_sequence
 // =====================================================================
 
 //! @brief Wraps a runtime argument sequence with optional bounds.
 template <class _Arg, class _StaticBounds = __no_bounds>
-struct __immediate_sequence
+struct immediate_sequence
 {
   using __element_type = __element_type_of_t<_Arg>;
 
@@ -342,14 +342,14 @@ struct __immediate_sequence
   }
 
 public:
-  _CCCL_API constexpr __immediate_sequence(_Arg __arg) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_bounds();
     __validate_value();
   }
 
-  _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_bounds();
@@ -357,7 +357,7 @@ struct __immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -367,7 +367,7 @@ struct __immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -377,27 +377,27 @@ struct __immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
-      : __immediate_sequence(::cuda::std::move(__arg), __sb, __rb)
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
+      : immediate_sequence(::cuda::std::move(__arg), __sb, __rb)
   {}
 };
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
+  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
+  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> __immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
+  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
-// __deferred_base / __deferred / __deferred_sequence
+// __deferred_base / deferred / deferred_sequence
 // =====================================================================
 
 //! @brief Common base for deferred argument wrappers.
@@ -451,57 +451,57 @@ struct __deferred_base
 //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available
 //! by the time the argument is consumed in stream order.
 template <class _Arg, class _StaticBounds = __no_bounds>
-struct __deferred : __deferred_base<_Arg, _StaticBounds>
+struct deferred : __deferred_base<_Arg, _StaticBounds>
 {
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
 };
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg>
-_CCCL_HOST_DEVICE __deferred(_Arg) -> __deferred<_Arg>;
+_CCCL_HOST_DEVICE deferred(_Arg) -> deferred<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __deferred(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>)
+  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp>
-_CCCL_HOST_DEVICE __deferred(_Arg, __runtime_bounds<_Tp>) -> __deferred<_Arg>;
+_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>) -> deferred<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE __deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
+  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> __deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
+  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be
 //! available by the time the argument is consumed in stream order.
 template <class _Arg, class _StaticBounds = __no_bounds>
-struct __deferred_sequence : __deferred_base<_Arg, _StaticBounds>
+struct deferred_sequence : __deferred_base<_Arg, _StaticBounds>
 {
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
 };
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg>
-_CCCL_HOST_DEVICE __deferred_sequence(_Arg) -> __deferred_sequence<_Arg>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg) -> deferred_sequence<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
+  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp>
-_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> __deferred_sequence<_Arg>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> deferred_sequence<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
+  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> __deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
+  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
@@ -511,17 +511,17 @@ _CCCL_HOST_DEVICE __deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_boun
 template <class _Tp>
 inline constexpr bool __is_wrapper_v = false;
 template <class _Arg, class _StaticBounds>
-inline constexpr bool __is_wrapper_v<__immediate<_Arg, _StaticBounds>> = true;
+inline constexpr bool __is_wrapper_v<immediate<_Arg, _StaticBounds>> = true;
 template <auto _Value>
-inline constexpr bool __is_wrapper_v<__constant<_Value>> = true;
+inline constexpr bool __is_wrapper_v<constant<_Value>> = true;
 template <auto _Value>
-inline constexpr bool __is_wrapper_v<__constant_sequence<_Value>> = true;
+inline constexpr bool __is_wrapper_v<constant_sequence<_Value>> = true;
 template <class _Arg, class _StaticBounds>
-inline constexpr bool __is_wrapper_v<__immediate_sequence<_Arg, _StaticBounds>> = true;
+inline constexpr bool __is_wrapper_v<immediate_sequence<_Arg, _StaticBounds>> = true;
 template <class _Arg, class _StaticBounds>
-inline constexpr bool __is_wrapper_v<__deferred<_Arg, _StaticBounds>> = true;
+inline constexpr bool __is_wrapper_v<deferred<_Arg, _StaticBounds>> = true;
 template <class _Arg, class _StaticBounds>
-inline constexpr bool __is_wrapper_v<__deferred_sequence<_Arg, _StaticBounds>> = true;
+inline constexpr bool __is_wrapper_v<deferred_sequence<_Arg, _StaticBounds>> = true;
 
 _CCCL_TEMPLATE(class _Tp)
 _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
@@ -531,87 +531,87 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate<_Arg, _StaticBounds>&& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate<_Arg, _StaticBounds>&& __arg) noexcept
 {
   return ::cuda::std::move(__arg.__arg_);
 }
 
 template <auto _Value>
 [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t<decltype(_Value)>
-__unwrap(const __constant<_Value>&) noexcept
+__unwrap(const constant<_Value>&) noexcept
 {
   return _Value;
 }
 
 template <auto _Value>
 [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t<decltype(_Value)>
-__unwrap(const __constant_sequence<_Value>&) noexcept
+__unwrap(const constant_sequence<_Value>&) noexcept
 {
   return _Value;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept
 {
   return ::cuda::std::move(__arg.__arg_);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__deferred<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __deferred<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__deferred<_Arg, _StaticBounds>&& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred<_Arg, _StaticBounds>&& __arg) noexcept
 {
   return ::cuda::std::move(__arg.__arg_);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept
 {
   return ::cuda::std::move(__arg.__arg_);
 }
@@ -677,7 +677,7 @@ struct __traits_impl
 };
 
 template <auto _Value>
-struct __traits_impl<__constant<_Value>>
+struct __traits_impl<constant<_Value>>
 {
   using value_type                      = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using element_type                    = value_type;
@@ -689,7 +689,7 @@ struct __traits_impl<__constant<_Value>>
 };
 
 template <class _Arg, class _StaticBounds>
-struct __traits_impl<__immediate<_Arg, _StaticBounds>>
+struct __traits_impl<immediate<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
@@ -704,7 +704,7 @@ struct __traits_impl<__immediate<_Arg, _StaticBounds>>
 };
 
 template <auto _Value>
-struct __traits_impl<__constant_sequence<_Value>>
+struct __traits_impl<constant_sequence<_Value>>
 {
   using value_type   = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using element_type = __element_type_of_t<value_type>;
@@ -717,7 +717,7 @@ struct __traits_impl<__constant_sequence<_Value>>
 };
 
 template <class _Arg, class _StaticBounds>
-struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>>
+struct __traits_impl<immediate_sequence<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
@@ -733,7 +733,7 @@ struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>>
 };
 
 template <class _Arg, class _StaticBounds>
-struct __traits_impl<__deferred<_Arg, _StaticBounds>>
+struct __traits_impl<deferred<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
@@ -748,7 +748,7 @@ struct __traits_impl<__deferred<_Arg, _StaticBounds>>
 };
 
 template <class _Arg, class _StaticBounds>
-struct __traits_impl<__deferred_sequence<_Arg, _StaticBounds>>
+struct __traits_impl<deferred_sequence<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
@@ -779,25 +779,25 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) )
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value>) noexcept
 {
   return __constant_compute_lowest<_Value>();
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant_sequence<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant_sequence<_Value>) noexcept
 {
   return __constant_sequence_compute_lowest<_Value>();
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate<_Arg, _StaticBounds> __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
@@ -805,7 +805,7 @@ template <class _Arg, class _StaticBounds>
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__deferred<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
@@ -813,7 +813,7 @@ template <class _Arg, class _StaticBounds>
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(__deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
@@ -829,25 +829,25 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) )
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value>) noexcept
 {
   return __constant_compute_highest<_Value>();
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant_sequence<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(constant_sequence<_Value>) noexcept
 {
   return __constant_sequence_compute_highest<_Value>();
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate<_Arg, _StaticBounds> __arg) noexcept
 {
   return __arg.__arg_;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
@@ -855,7 +855,7 @@ template <class _Arg, class _StaticBounds>
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__deferred<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(deferred<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
@@ -863,7 +863,7 @@ template <class _Arg, class _StaticBounds>
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(__deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
   using _ET = __element_type_of_t<_Arg>;
   __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h
index 11c46c74417..5ccaafb4589 100644
--- a/libcudacxx/include/cuda/__argument/argument_bounds.h
+++ b/libcudacxx/include/cuda/__argument/argument_bounds.h
@@ -119,7 +119,7 @@ inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true;
 //! @tparam _Upper The static upper bound.
 //! @return A compile-time bounds object.
 template <auto _Lower, auto _Upper>
-[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> __bounds() noexcept
+[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> bounds() noexcept
 {
   return {};
 }
@@ -130,7 +130,7 @@ template <auto _Lower, auto _Upper>
 //! @param __upper The runtime upper bound.
 //! @return A runtime bounds object.
 template <class _Tp>
-[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> __bounds(_Tp __lower, _Tp __upper) noexcept
+[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept
 {
   return {__lower, __upper};
 }
diff --git a/libcudacxx/include/cuda/argument b/libcudacxx/include/cuda/argument
new file mode 100644
index 00000000000..bcb079e7e5e
--- /dev/null
+++ b/libcudacxx/include/cuda/argument
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_ARGUMENT_
+#define _CUDA_ARGUMENT_
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__argument/argument.h>
+
+#endif // _CUDA_ARGUMENT_
diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h
index 325830c17cf..f3cc191adfc 100644
--- a/libcudacxx/include/cuda/std/__internal/namespaces.h
+++ b/libcudacxx/include/cuda/std/__internal/namespaces.h
@@ -115,8 +115,8 @@
 #define _CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION _CCCL_BEGIN_NAMESPACE(cuda::execution)
 #define _CCCL_END_NAMESPACE_CUDA_EXECUTION   _CCCL_END_NAMESPACE(cuda::execution)
 
-#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::__argument)
-#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT   _CCCL_END_NAMESPACE(cuda::__argument)
+#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::argument)
+#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT   _CCCL_END_NAMESPACE(cuda::argument)
 
 // Namespace to avoid name collisions with CPOs on clang-16 (see
 // https://godbolt.org/z/9TadonrdM for example). MSVC's ancient parser also gets confused with
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
index c03b685e821..9a5717f32c3 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/cassert>
 #include <cuda/std/type_traits>
 
@@ -20,27 +20,27 @@ TEST_FUNC constexpr bool test()
 
   // Basic static bounds
   {
-    constexpr auto b = cuda::__argument::__static_bounds<1, 4096>{};
+    constexpr auto b = cuda::argument::__static_bounds<1, 4096>{};
     static_assert(b.lower() == 1);
     static_assert(b.upper() == 4096);
   }
 
   // Exact static bounds
   {
-    constexpr auto b = cuda::__argument::__static_bounds<42, 42>{};
+    constexpr auto b = cuda::argument::__static_bounds<42, 42>{};
     static_assert(b.lower() == 42);
     static_assert(b.upper() == 42);
   }
 
   // Long type deduced from NTTPs
   {
-    static_assert(cuda::std::is_same_v<decltype(cuda::__argument::__static_bounds<0L, 1000L>::lower()), long>);
+    static_assert(cuda::std::is_same_v<decltype(cuda::argument::__static_bounds<0L, 1000L>::lower()), long>);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Static bounds preserve their original NTTP types
   {
-    constexpr auto b = cuda::__argument::__bounds<1.0f, 8.0f>();
+    constexpr auto b = cuda::argument::bounds<1.0f, 8.0f>();
     static_assert(b.lower() == 1.0f);
     static_assert(b.upper() == 8);
     static_assert(cuda::std::is_same_v<decltype(b.lower()), float>);
@@ -52,9 +52,13 @@ TEST_FUNC constexpr bool test()
 
   // Basic runtime bounds
   {
-    auto b = cuda::__argument::__runtime_bounds{10, 100};
+    auto b = cuda::argument::__runtime_bounds{10, 100};
     assert(b.lower() == 10);
     assert(b.upper() == 100);
+    assert(b.__lower_ == 10);
+    assert(b.__upper_ == 100);
+    b.__upper_ = 90;
+    assert(b.upper() == 90);
     static_assert(cuda::std::is_same_v<decltype(b.lower()), int>);
   }
 
@@ -62,35 +66,35 @@ TEST_FUNC constexpr bool test()
 
   // Static via factory
   {
-    constexpr auto b = cuda::__argument::__bounds<1, 8>();
+    constexpr auto b = cuda::argument::bounds<1, 8>();
     static_assert(b.lower() == 1);
     static_assert(b.upper() == 8);
-    static_assert(cuda::__argument::__is_static_bounds_cv_v<decltype(b)>);
-    static_assert(!cuda::__argument::__is_runtime_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::__argument::__is_bounds_v<decltype(b)>);
+    static_assert(cuda::argument::__is_static_bounds_cv_v<decltype(b)>);
+    static_assert(!cuda::argument::__is_runtime_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::argument::__is_bounds_v<decltype(b)>);
   }
 
   // Runtime via factory
   {
-    auto b = cuda::__argument::__bounds(10, 100);
+    auto b = cuda::argument::bounds(10, 100);
     assert(b.lower() == 10);
     assert(b.upper() == 100);
-    static_assert(!cuda::__argument::__is_static_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::__argument::__is_runtime_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::__argument::__is_bounds_v<decltype(b)>);
+    static_assert(!cuda::argument::__is_static_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::argument::__is_runtime_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::argument::__is_bounds_v<decltype(b)>);
   }
 
   // Static and runtime bounds intersection
   {
-    static_assert(cuda::__argument::__has_bounds_intersection<int, cuda::__argument::__static_bounds<1, 100>>(
-      cuda::__argument::__runtime_bounds<int>{50, 200}));
-    static_assert(!cuda::__argument::__has_bounds_intersection<int, cuda::__argument::__static_bounds<100, 200>>(
-      cuda::__argument::__runtime_bounds<int>{0, 50}));
+    static_assert(cuda::argument::__has_bounds_intersection<int, cuda::argument::__static_bounds<1, 100>>(
+      cuda::argument::__runtime_bounds<int>{50, 200}));
+    static_assert(!cuda::argument::__has_bounds_intersection<int, cuda::argument::__static_bounds<100, 200>>(
+      cuda::argument::__runtime_bounds<int>{0, 50}));
   }
 
   // Non-bounds type
   {
-    static_assert(!cuda::__argument::__is_bounds_v<int>);
+    static_assert(!cuda::argument::__is_bounds_v<int>);
   }
 
   return true;
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
index cdcc6665747..50a6b51c932 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 #include <cuda/std/array>
 #include <cuda/std/limits>
@@ -50,134 +50,128 @@ TEST_FUNC void test()
 {
   // --- __is_sequence_v / __is_single_value_v ---
 
-  static_assert(!cuda::__argument::__is_sequence_v<int>);
-  static_assert(!cuda::__argument::__is_sequence_v<color>);
-  static_assert(!cuda::__argument::__is_sequence_v<non_sequence_value>);
-  static_assert(cuda::__argument::__is_sequence_v<int*>);
-  static_assert(cuda::__argument::__is_sequence_v<cuda::counting_iterator<int>>);
-  static_assert(cuda::__argument::__is_sequence_v<cuda::std::span<int, 1>>);
-  static_assert(cuda::__argument::__is_sequence_v<const cuda::std::span<int, 1>&>);
-  static_assert(cuda::__argument::__is_sequence_v<cuda::std::span<int>>);
-  static_assert(cuda::__argument::__is_sequence_v<cuda::std::array<int, 3>>);
-  static_assert(cuda::__argument::__is_sequence_v<range_like<int>>);
-  static_assert(cuda::__argument::__is_sequence_v<element_type_like<int>>);
-  static_assert(cuda::__argument::__is_sequence_v<value_type_like<int>>);
-
-  static_assert(cuda::__argument::__is_single_value_v<int>);
-  static_assert(cuda::__argument::__is_single_value_v<float>);
-  static_assert(cuda::__argument::__is_single_value_v<double>);
-  static_assert(cuda::__argument::__is_single_value_v<const int>);
-  static_assert(cuda::__argument::__is_single_value_v<color>);
-  static_assert(cuda::__argument::__is_single_value_v<non_sequence_value>);
-  static_assert(!cuda::__argument::__is_single_value_v<int*>);
-  static_assert(!cuda::__argument::__is_single_value_v<cuda::counting_iterator<int>>);
-  static_assert(!cuda::__argument::__is_single_value_v<cuda::std::span<int, 1>>);
-  static_assert(!cuda::__argument::__is_single_value_v<const cuda::std::span<int, 1>&>);
-  static_assert(!cuda::__argument::__is_single_value_v<cuda::std::span<int>>);
-  static_assert(!cuda::__argument::__is_single_value_v<cuda::std::span<int, 4>>);
-  static_assert(!cuda::__argument::__is_single_value_v<cuda::std::array<int, 3>>);
+  static_assert(!cuda::argument::__is_sequence_v<int>);
+  static_assert(!cuda::argument::__is_sequence_v<color>);
+  static_assert(!cuda::argument::__is_sequence_v<non_sequence_value>);
+  static_assert(cuda::argument::__is_sequence_v<int*>);
+  static_assert(cuda::argument::__is_sequence_v<cuda::counting_iterator<int>>);
+  static_assert(cuda::argument::__is_sequence_v<cuda::std::span<int, 1>>);
+  static_assert(cuda::argument::__is_sequence_v<const cuda::std::span<int, 1>&>);
+  static_assert(cuda::argument::__is_sequence_v<cuda::std::span<int>>);
+  static_assert(cuda::argument::__is_sequence_v<cuda::std::array<int, 3>>);
+  static_assert(cuda::argument::__is_sequence_v<range_like<int>>);
+  static_assert(cuda::argument::__is_sequence_v<element_type_like<int>>);
+  static_assert(cuda::argument::__is_sequence_v<value_type_like<int>>);
+
+  static_assert(cuda::argument::__is_single_value_v<int>);
+  static_assert(cuda::argument::__is_single_value_v<float>);
+  static_assert(cuda::argument::__is_single_value_v<double>);
+  static_assert(cuda::argument::__is_single_value_v<const int>);
+  static_assert(cuda::argument::__is_single_value_v<color>);
+  static_assert(cuda::argument::__is_single_value_v<non_sequence_value>);
+  static_assert(!cuda::argument::__is_single_value_v<int*>);
+  static_assert(!cuda::argument::__is_single_value_v<cuda::counting_iterator<int>>);
+  static_assert(!cuda::argument::__is_single_value_v<cuda::std::span<int, 1>>);
+  static_assert(!cuda::argument::__is_single_value_v<const cuda::std::span<int, 1>&>);
+  static_assert(!cuda::argument::__is_single_value_v<cuda::std::span<int>>);
+  static_assert(!cuda::argument::__is_single_value_v<cuda::std::span<int, 4>>);
+  static_assert(!cuda::argument::__is_single_value_v<cuda::std::array<int, 3>>);
 
   // --- __element_type_of_t ---
 
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<const cuda::std::span<int, 1>&>, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<int*>, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<cuda::counting_iterator<int>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<cuda::std::array<int, 3>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<range_like<int>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<element_type_like<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<const cuda::std::span<int, 1>&>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<int*>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::counting_iterator<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::std::array<int, 3>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<range_like<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<element_type_like<int>>, int>);
   static_assert(
-    cuda::std::is_same_v<cuda::__argument::__element_type_of_t<cuda::std::mdspan<const int, cuda::std::extents<int, 1>>>,
+    cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::std::mdspan<const int, cuda::std::extents<int, 1>>>,
                          int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__element_type_of_t<value_type_like<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<value_type_like<int>>, int>);
 
   // --- argument_traits: is_deferred ---
 
-  static_assert(!cuda::__argument::__traits<int>::is_deferred);
-  static_assert(!cuda::__argument::__traits<cuda::__argument::__immediate<int>>::is_deferred);
-  static_assert(!cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>::is_deferred);
-  static_assert(!cuda::__argument::__traits<cuda::__argument::__constant<42>>::is_deferred);
+  static_assert(!cuda::argument::__traits<int>::is_deferred);
+  static_assert(!cuda::argument::__traits<cuda::argument::immediate<int>>::is_deferred);
+  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_deferred);
+  static_assert(!cuda::argument::__traits<cuda::argument::constant<42>>::is_deferred);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
-    !cuda::__argument::__traits<cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_deferred);
+    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_deferred);
 #endif // TEST_HAS_CLASS_NTTP
-  static_assert(cuda::__argument::__traits<cuda::__argument::__deferred<cuda::std::span<int, 1>>>::is_deferred);
-  static_assert(cuda::__argument::__traits<cuda::__argument::__deferred_sequence<cuda::std::span<int>>>::is_deferred);
+  static_assert(cuda::argument::__traits<cuda::argument::deferred<cuda::std::span<int, 1>>>::is_deferred);
+  static_assert(cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>::is_deferred);
 
   // --- argument_traits: is_single_value ---
 
-  static_assert(cuda::__argument::__traits<int>::is_single_value);
-  static_assert(!cuda::__argument::__traits<int*>::is_single_value);
-  static_assert(cuda::__argument::__traits<cuda::__argument::__immediate<int>>::is_single_value);
-  static_assert(cuda::__argument::__traits<cuda::__argument::__immediate<int*>>::is_single_value);
-  static_assert(
-    cuda::__argument::__traits<cuda::__argument::__immediate<cuda::counting_iterator<int>>>::is_single_value);
-  static_assert(
-    !cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
-  static_assert(cuda::__argument::__traits<cuda::__argument::__constant<42>>::is_single_value);
+  static_assert(cuda::argument::__traits<int>::is_single_value);
+  static_assert(!cuda::argument::__traits<int*>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::immediate<int>>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::immediate<int*>>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::immediate<cuda::counting_iterator<int>>>::is_single_value);
+  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::constant<42>>::is_single_value);
 #if TEST_HAS_CLASS_NTTP
-  static_assert(!cuda::__argument::__traits<
-                cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
-#endif // TEST_HAS_CLASS_NTTP
-  static_assert(cuda::__argument::__traits<cuda::__argument::__deferred<int*>>::is_single_value);
   static_assert(
-    !cuda::__argument::__traits<cuda::__argument::__deferred_sequence<cuda::std::span<int>>>::is_single_value);
+    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+#endif // TEST_HAS_CLASS_NTTP
+  static_assert(cuda::argument::__traits<cuda::argument::deferred<int*>>::is_single_value);
+  static_assert(!cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>::is_single_value);
 
   // --- argument_traits: value_type ---
 
-  static_assert(cuda::std::is_same_v<cuda::__argument::__traits<int>::value_type, int>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__traits<cuda::__argument::__immediate<int>>::value_type, int>);
-  static_assert(cuda::std::is_same_v<
-                cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>::value_type,
-                cuda::std::span<int>>);
-  static_assert(cuda::std::is_same_v<cuda::__argument::__traits<cuda::__argument::__constant<42>>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__traits<int>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::immediate<int>>::value_type, int>);
+  static_assert(
+    cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::value_type,
+                         cuda::std::span<int>>);
+  static_assert(cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::constant<42>>::value_type, int>);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
     cuda::std::is_same_v<
-      cuda::__argument::__traits<cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::value_type,
+      cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::value_type,
       cuda::std::array<int, 3>>);
 #endif // TEST_HAS_CLASS_NTTP
 
   // --- argument_traits: lowest / highest ---
 
-  static_assert(cuda::__argument::__traits<int>::lowest == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::__argument::__traits<int>::highest == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::__argument::__traits<const int>::lowest == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::__argument::__traits<int&>::highest == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::__argument::__traits<float>::lowest == cuda::std::numeric_limits<float>::lowest());
-  static_assert(cuda::__argument::__traits<float>::highest == (cuda::std::numeric_limits<float>::max)());
+  static_assert(cuda::argument::__traits<int>::lowest == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::argument::__traits<int>::highest == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::argument::__traits<const int>::lowest == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::argument::__traits<int&>::highest == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::argument::__traits<float>::lowest == cuda::std::numeric_limits<float>::lowest());
+  static_assert(cuda::argument::__traits<float>::highest == (cuda::std::numeric_limits<float>::max)());
   static_assert(
-    cuda::__argument::__traits<const cuda::__argument::__immediate<int, cuda::__argument::__static_bounds<1, 8>>>::lowest
-    == 1);
+    cuda::argument::__traits<const cuda::argument::immediate<int, cuda::argument::__static_bounds<1, 8>>>::lowest == 1);
   static_assert(
-    cuda::__argument::__traits<cuda::__argument::__immediate<int, cuda::__argument::__static_bounds<1, 8>>&>::highest
-    == 8);
+    cuda::argument::__traits<cuda::argument::immediate<int, cuda::argument::__static_bounds<1, 8>>&>::highest == 8);
   static_assert(
-    cuda::__argument::__traits<
-      cuda::__argument::__immediate_sequence<cuda::std::span<int>, cuda::__argument::__static_bounds<1, 8>>>::highest
+    cuda::argument::__traits<
+      cuda::argument::immediate_sequence<cuda::std::span<int>, cuda::argument::__static_bounds<1, 8>>>::highest
     == 8);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
-    cuda::__argument::__traits<cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::lowest == 1);
+    cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::lowest == 1);
   static_assert(
-    cuda::__argument::__traits<cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::highest == 3);
+    cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::highest == 3);
 #endif // TEST_HAS_CLASS_NTTP
 
   // --- Free function bounds on plain values ---
 
-  static_assert(cuda::__argument::__lowest_(42) == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::__argument::__highest_(42) == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::__argument::__lowest_(1.0f) == cuda::std::numeric_limits<float>::lowest());
-  static_assert(cuda::__argument::__highest_(1.0f) == (cuda::std::numeric_limits<float>::max)());
+  static_assert(cuda::argument::__lowest_(42) == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::argument::__highest_(42) == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::argument::__lowest_(1.0f) == cuda::std::numeric_limits<float>::lowest());
+  static_assert(cuda::argument::__highest_(1.0f) == (cuda::std::numeric_limits<float>::max)());
 
   // --- Scalar and sequence wrappers expose distinct single-value traits ---
 
-  static_assert(cuda::__argument::__traits<cuda::__argument::__constant<42>>::is_single_value);
-  static_assert(cuda::__argument::__traits<cuda::__argument::__immediate<int>>::is_single_value);
-  static_assert(
-    !cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::constant<42>>::is_single_value);
+  static_assert(cuda::argument::__traits<cuda::argument::immediate<int>>::is_single_value);
+  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
 #if TEST_HAS_CLASS_NTTP
-  static_assert(!cuda::__argument::__traits<
-                cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+  static_assert(
+    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
 #endif // TEST_HAS_CLASS_NTTP
 }
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
index 9949e0013b4..f9bda950aa7 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 #include <cuda/std/cassert>
 #include <cuda/std/limits>
@@ -22,102 +22,110 @@ TEST_FUNC constexpr bool test()
   // Deferred single value via span<T, 1>
   {
     int val  = 42;
-    auto def = cuda::__argument::__deferred{cuda::std::span<int, 1>{&val, 1}};
-    assert(cuda::__argument::__unwrap(def)[0] == 42);
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == (cuda::std::numeric_limits<int>::max)());
+    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}};
+    assert(cuda::argument::__unwrap(def)[0] == 42);
+    assert(def.__arg_[0] == 42);
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == (cuda::std::numeric_limits<int>::max)());
   }
 
   // Deferred single value with static bounds
   {
     int val  = 42;
-    auto def = cuda::__argument::__deferred{cuda::std::span<int, 1>{&val, 1}, cuda::__argument::__bounds<1, 1000>()};
-    assert(cuda::__argument::__unwrap(def)[0] == 42);
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == 1000);
+    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds<1, 1000>()};
+    assert(cuda::argument::__unwrap(def)[0] == 42);
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == 1000);
   }
 
   // Deferred single value via pointer
   {
     int val     = 42;
-    using def_t = cuda::__argument::__deferred<int*, cuda::__argument::__static_bounds<0, 100>>;
-    static_assert(cuda::__argument::__traits<def_t>::lowest == 0);
-    static_assert(cuda::__argument::__traits<def_t>::highest == 100);
+    using def_t = cuda::argument::deferred<int*, cuda::argument::__static_bounds<0, 100>>;
+    static_assert(cuda::argument::__traits<def_t>::lowest == 0);
+    static_assert(cuda::argument::__traits<def_t>::highest == 100);
     // Also verify construction works
-    auto def = cuda::__argument::__deferred{&val, cuda::__argument::__bounds<0, 100>()};
-    assert(cuda::__argument::__unwrap(def) == &val);
+    auto def = cuda::argument::deferred{&val, cuda::argument::bounds<0, 100>()};
+    assert(cuda::argument::__unwrap(def) == &val);
   }
 
   // Deferred single value via fancy iterator
   {
     auto it  = cuda::counting_iterator<int>{42};
-    auto def = cuda::__argument::__deferred{it, cuda::__argument::__bounds<0, 100>()};
-    assert(cuda::__argument::__unwrap(def)[0] == 42);
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 0);
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == 100);
-    static_assert(cuda::__argument::__traits<decltype(def)>::is_single_value);
+    auto def = cuda::argument::deferred{it, cuda::argument::bounds<0, 100>()};
+    assert(cuda::argument::__unwrap(def)[0] == 42);
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 0);
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == 100);
+    static_assert(cuda::argument::__traits<decltype(def)>::is_single_value);
   }
 
   // Deferred single value with both bounds, runtime bounds first
   {
     int val  = 42;
-    auto def = cuda::__argument::__deferred{
-      cuda::std::span<int, 1>{&val, 1}, cuda::__argument::__bounds(5, 100), cuda::__argument::__bounds<1, 256>()};
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == 256);
-    assert(cuda::__argument::__lowest_(def) == 5);
-    assert(cuda::__argument::__highest_(def) == 100);
+    auto def = cuda::argument::deferred{
+      cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 256>()};
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == 256);
+    assert(def.__runtime_bounds_.__lower_ == 5);
+    assert(def.__runtime_bounds_.__upper_ == 100);
+    assert(cuda::argument::__lowest_(def) == 5);
+    assert(cuda::argument::__highest_(def) == 100);
+    def.__runtime_bounds_.__upper_ = 90;
+    assert(cuda::argument::__highest_(def) == 90);
   }
 
   // Deferred sequence via fancy iterator
   {
     auto it  = cuda::counting_iterator<int>{10};
-    auto def = cuda::__argument::__deferred_sequence{it, cuda::__argument::__bounds<0, 100>()};
-    assert(cuda::__argument::__unwrap(def)[0] == 10);
-    assert(cuda::__argument::__unwrap(def)[2] == 12);
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 0);
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == 100);
-    static_assert(!cuda::__argument::__traits<decltype(def)>::is_single_value);
+    auto def = cuda::argument::deferred_sequence{it, cuda::argument::bounds<0, 100>()};
+    assert(cuda::argument::__unwrap(def)[0] == 10);
+    assert(cuda::argument::__unwrap(def)[2] == 12);
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 0);
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == 100);
+    static_assert(!cuda::argument::__traits<decltype(def)>::is_single_value);
   }
 
   // Deferred sequence with both bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto def   = cuda::__argument::__deferred_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds<1, 4096>(), cuda::__argument::__bounds(5, 100)};
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 1);
-    assert(cuda::__argument::__lowest_(def) == 5);
-    assert(cuda::__argument::__highest_(def) == 100);
+    auto def   = cuda::argument::deferred_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 4096>(), cuda::argument::bounds(5, 100)};
+    assert(def.__arg_.size() == 4);
+    assert(def.__runtime_bounds_.__lower_ == 5);
+    assert(def.__runtime_bounds_.__upper_ == 100);
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
+    assert(cuda::argument::__lowest_(def) == 5);
+    assert(cuda::argument::__highest_(def) == 100);
   }
 
   // Deferred sequence with both bounds, runtime bounds first
   {
     int arr[4] = {10, 20, 30, 40};
-    auto def   = cuda::__argument::__deferred_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds(5, 100), cuda::__argument::__bounds<1, 4096>()};
-    static_assert(cuda::__argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(def)>::highest == 4096);
-    assert(cuda::__argument::__lowest_(def) == 5);
-    assert(cuda::__argument::__highest_(def) == 100);
+    auto def   = cuda::argument::deferred_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 4096>()};
+    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(def)>::highest == 4096);
+    assert(cuda::argument::__lowest_(def) == 5);
+    assert(cuda::argument::__highest_(def) == 100);
   }
 
   // Traits: deferred is single value
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__deferred<cuda::std::span<int, 1>>>;
+    using traits = cuda::argument::__traits<cuda::argument::deferred<cuda::std::span<int, 1>>>;
     static_assert(traits::is_deferred);
     static_assert(traits::is_single_value);
   }
 
   // Traits: deferred with pointer is also single value
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__deferred<int*>>;
+    using traits = cuda::argument::__traits<cuda::argument::deferred<int*>>;
     static_assert(traits::is_deferred);
     static_assert(traits::is_single_value);
   }
 
   // Traits: deferred_sequence is not single value
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__deferred_sequence<cuda::std::span<int>>>;
+    using traits = cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>;
     static_assert(traits::is_deferred);
     static_assert(!traits::is_single_value);
   }
@@ -125,16 +133,16 @@ TEST_FUNC constexpr bool test()
   // Unwrap: deferred
   {
     int val  = 99;
-    auto def = cuda::__argument::__deferred{cuda::std::span<int, 1>{&val, 1}};
-    auto& v  = cuda::__argument::__unwrap(def);
+    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}};
+    auto& v  = cuda::argument::__unwrap(def);
     assert(v[0] == 99);
   }
 
   // Unwrap: deferred_sequence
   {
     int arr[3]    = {10, 20, 30};
-    auto def      = cuda::__argument::__deferred_sequence{cuda::std::span<int>{arr, 3}};
-    const auto& v = cuda::__argument::__unwrap(def);
+    auto def      = cuda::argument::deferred_sequence{cuda::std::span<int>{arr, 3}};
+    const auto& v = cuda::argument::__unwrap(def);
     assert(v.size() == 3);
     assert(v[1] == 20);
   }
@@ -142,14 +150,14 @@ TEST_FUNC constexpr bool test()
   // Unwrap: rvalue deferred returns by value
   {
     int val = 99;
-    auto v  = cuda::__argument::__unwrap(cuda::__argument::__deferred{cuda::std::span<int, 1>{&val, 1}});
+    auto v  = cuda::argument::__unwrap(cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}});
     assert(v[0] == 99);
   }
 
   // Unwrap: rvalue deferred_sequence returns by value
   {
     int arr[3] = {10, 20, 30};
-    auto v     = cuda::__argument::__unwrap(cuda::__argument::__deferred_sequence{cuda::std::span<int>{arr, 3}});
+    auto v     = cuda::argument::__unwrap(cuda::argument::deferred_sequence{cuda::std::span<int>{arr, 3}});
     assert(v.size() == 3);
     assert(v[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
index eac5e71aa82..23e867be616 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/cassert>
 #include <cuda/std/limits>
 #include <cuda/std/span>
@@ -25,93 +25,99 @@ TEST_FUNC constexpr bool test()
 {
   // Uniform scalar via CTAD
   {
-    auto da = cuda::__argument::__immediate{5};
-    assert(cuda::__argument::__unwrap(da) == 5);
-    static_assert(cuda::__argument::__traits<decltype(da)>::lowest == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::__argument::__traits<decltype(da)>::highest == (cuda::std::numeric_limits<int>::max)());
-    assert(cuda::__argument::__lowest_(da) == 5);
-    assert(cuda::__argument::__highest_(da) == 5);
+    auto da = cuda::argument::immediate{5};
+    assert(cuda::argument::__unwrap(da) == 5);
+    assert(da.__arg_ == 5);
+    static_assert(cuda::argument::__traits<decltype(da)>::lowest == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::argument::__traits<decltype(da)>::highest == (cuda::std::numeric_limits<int>::max)());
+    assert(cuda::argument::__lowest_(da) == 5);
+    assert(cuda::argument::__highest_(da) == 5);
+    da.__arg_ = 6;
+    assert(cuda::argument::__unwrap(da) == 6);
   }
 
   // Uniform scalar with static bounds
   {
-    auto da = cuda::__argument::__immediate{5, cuda::__argument::__bounds<1, 8>()};
-    assert(cuda::__argument::__unwrap(da) == 5);
-    static_assert(cuda::__argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(da)>::highest == 8);
-    assert(cuda::__argument::__lowest_(da) == 5);
-    assert(cuda::__argument::__highest_(da) == 5);
+    auto da = cuda::argument::immediate{5, cuda::argument::bounds<1, 8>()};
+    assert(cuda::argument::__unwrap(da) == 5);
+    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(da)>::highest == 8);
+    assert(cuda::argument::__lowest_(da) == 5);
+    assert(cuda::argument::__highest_(da) == 5);
   }
 
   // Non-sequence values are accepted without scalar-only restrictions
   {
-    auto da = cuda::__argument::__immediate{non_sequence_value{7}};
-    assert(cuda::__argument::__unwrap(da).payload == 7);
+    auto da = cuda::argument::immediate{non_sequence_value{7}};
+    assert(cuda::argument::__unwrap(da).payload == 7);
   }
 
   // Pointer-like types can still represent a single value when explicitly wrapped that way
   {
     int value = 11;
-    auto da   = cuda::__argument::__immediate{&value};
-    static_assert(cuda::__argument::__traits<decltype(da)>::is_single_value);
-    assert(*cuda::__argument::__unwrap(da) == 11);
+    auto da   = cuda::argument::immediate{&value};
+    static_assert(cuda::argument::__traits<decltype(da)>::is_single_value);
+    assert(*cuda::argument::__unwrap(da) == 11);
   }
 
   // Per-segment span with runtime bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da =
-      cuda::__argument::__immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds(1L, 100L)};
-    assert(cuda::__argument::__unwrap(da).size() == 4);
-    assert(cuda::__argument::__lowest_(da) == 1);
-    assert(cuda::__argument::__highest_(da) == 100);
+    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::argument::bounds(1L, 100L)};
+    assert(cuda::argument::__unwrap(da).size() == 4);
+    assert(da.__arg_.size() == 4);
+    assert(da.__runtime_bounds_.__lower_ == 1);
+    assert(da.__runtime_bounds_.__upper_ == 100);
+    assert(cuda::argument::__lowest_(da) == 1);
+    assert(cuda::argument::__highest_(da) == 100);
+    da.__runtime_bounds_.__upper_ = 90;
+    assert(cuda::argument::__highest_(da) == 90);
   }
 
   // Per-segment span with both bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::__argument::__immediate_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(10, 200)};
-    static_assert(cuda::__argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(da)>::highest == 256);
-    assert(cuda::__argument::__lowest_(da) == 10);
-    assert(cuda::__argument::__highest_(da) == 200);
+    auto da    = cuda::argument::immediate_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(10, 200)};
+    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(da)>::highest == 256);
+    assert(cuda::argument::__lowest_(da) == 10);
+    assert(cuda::argument::__highest_(da) == 200);
   }
 
   // Per-segment span with both bounds, runtime bounds first
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::__argument::__immediate_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds(10, 200), cuda::__argument::__bounds<1, 256>()};
-    static_assert(cuda::__argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(da)>::highest == 256);
-    assert(cuda::__argument::__lowest_(da) == 10);
-    assert(cuda::__argument::__highest_(da) == 200);
+    auto da    = cuda::argument::immediate_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::argument::bounds(10, 200), cuda::argument::bounds<1, 256>()};
+    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(da)>::highest == 256);
+    assert(cuda::argument::__lowest_(da) == 10);
+    assert(cuda::argument::__highest_(da) == 200);
   }
 
   // Per-segment via span
   {
     int arr[4] = {1, 2, 3, 4};
-    auto da    = cuda::__argument::__immediate_sequence{cuda::std::span<int>{arr, 4}};
-    assert(cuda::__argument::__unwrap(da).size() == 4);
-    assert(cuda::__argument::__unwrap(da)[0] == 1);
-    assert(cuda::__argument::__unwrap(da)[3] == 4);
+    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}};
+    assert(cuda::argument::__unwrap(da).size() == 4);
+    assert(cuda::argument::__unwrap(da)[0] == 1);
+    assert(cuda::argument::__unwrap(da)[3] == 4);
   }
 
   // Per-segment with static bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da =
-      cuda::__argument::__immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::__argument::__bounds<1, 100>()};
-    assert(cuda::__argument::__unwrap(da).size() == 4);
-    assert(cuda::__argument::__unwrap(da)[2] == 30);
-    static_assert(cuda::__argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::__argument::__traits<decltype(da)>::highest == 100);
+    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 100>()};
+    assert(cuda::argument::__unwrap(da).size() == 4);
+    assert(cuda::argument::__unwrap(da)[2] == 30);
+    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::argument::__traits<decltype(da)>::highest == 100);
   }
 
   // Traits
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__immediate<int>>;
+    using traits = cuda::argument::__traits<cuda::argument::immediate<int>>;
     static_assert(!traits::is_deferred);
     static_assert(traits::is_single_value);
     static_assert(cuda::std::is_same_v<traits::value_type, int>);
@@ -119,7 +125,7 @@ TEST_FUNC constexpr bool test()
 
   // Sequence traits
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>;
+    using traits = cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>;
     static_assert(!traits::is_deferred);
     static_assert(!traits::is_single_value);
     static_assert(cuda::std::is_same_v<traits::value_type, cuda::std::span<int>>);
@@ -128,39 +134,38 @@ TEST_FUNC constexpr bool test()
   // __is_single_value_v on unwrapped types
   {
     static_assert(
-      cuda::__argument::__is_single_value_v<cuda::__argument::__traits<cuda::__argument::__immediate<int>>::value_type>);
-    static_assert(
-      !cuda::__argument::__traits<cuda::__argument::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
+      cuda::argument::__is_single_value_v<cuda::argument::__traits<cuda::argument::immediate<int>>::value_type>);
+    static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
   }
 
   // Unwrap: scalar
   {
-    auto da = cuda::__argument::__immediate{7};
-    auto& v = cuda::__argument::__unwrap(da);
+    auto da = cuda::argument::immediate{7};
+    auto& v = cuda::argument::__unwrap(da);
     assert(v == 7);
     v = 8;
-    assert(cuda::__argument::__unwrap(da) == 8);
+    assert(cuda::argument::__unwrap(da) == 8);
   }
 
   // Unwrap: span
   {
     int arr[3]    = {10, 20, 30};
-    auto da       = cuda::__argument::__immediate_sequence{cuda::std::span<int>{arr, 3}};
-    const auto& v = cuda::__argument::__unwrap(da);
+    auto da       = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 3}};
+    const auto& v = cuda::argument::__unwrap(da);
     assert(v.size() == 3);
     assert(v[1] == 20);
   }
 
   // Unwrap: rvalue scalar returns by value
   {
-    const auto& v = cuda::__argument::__unwrap(cuda::__argument::__immediate{7});
+    const auto& v = cuda::argument::__unwrap(cuda::argument::immediate{7});
     assert(v == 7);
   }
 
   // Unwrap: rvalue span returns by value
   {
     int arr[3] = {10, 20, 30};
-    auto v     = cuda::__argument::__unwrap(cuda::__argument::__immediate_sequence{cuda::std::span<int>{arr, 3}});
+    auto v     = cuda::argument::__unwrap(cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 3}});
     assert(v.size() == 3);
     assert(v[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
index 7d40183070b..485a17fe72b 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
@@ -8,7 +8,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/array>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
@@ -24,37 +24,37 @@ TEST_FUNC void test()
 {
   // Basic value
   {
-    constexpr auto sa = cuda::__argument::__constant<42>{};
+    constexpr auto sa = cuda::argument::constant<42>{};
     static_assert(sa.value() == 42);
     static_assert(cuda::std::is_same_v<decltype(sa)::value_type, int>);
   }
 
   // Different types
   {
-    constexpr auto sa_long = cuda::__argument::__constant<100L>{};
+    constexpr auto sa_long = cuda::argument::constant<100L>{};
     static_assert(sa_long.value() == 100L);
     static_assert(cuda::std::is_same_v<decltype(sa_long)::value_type, long>);
   }
 
   // Negative value
   {
-    constexpr auto sa_neg = cuda::__argument::__constant<-1>{};
+    constexpr auto sa_neg = cuda::argument::constant<-1>{};
     static_assert(sa_neg.value() == -1);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Non-sequence values are accepted without scalar-only restrictions
   {
-    constexpr auto sa = cuda::__argument::__constant<non_sequence_value{7}>{};
+    constexpr auto sa = cuda::argument::constant<non_sequence_value{7}>{};
     static_assert(sa.value().payload == 7);
-    static_assert(cuda::__argument::__unwrap(sa).payload == 7);
+    static_assert(cuda::argument::__unwrap(sa).payload == 7);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
 #if TEST_HAS_CLASS_NTTP
   // Array sequence
   {
-    constexpr auto sa_arr = cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
+    constexpr auto sa_arr = cuda::argument::constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
     static_assert(sa_arr.value()[0] == 128);
     static_assert(sa_arr.value()[1] == 256);
     static_assert(sa_arr.value()[2] == 512);
@@ -64,32 +64,32 @@ TEST_FUNC void test()
 
   // Bounds: scalar
   {
-    constexpr auto sa = cuda::__argument::__constant<42>{};
-    static_assert(cuda::__argument::__lowest_(sa) == 42);
-    static_assert(cuda::__argument::__highest_(sa) == 42);
+    constexpr auto sa = cuda::argument::constant<42>{};
+    static_assert(cuda::argument::__lowest_(sa) == 42);
+    static_assert(cuda::argument::__highest_(sa) == 42);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Bounds: array sequence computes lowest/highest of elements
   {
-    constexpr auto sa = cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
-    static_assert(cuda::__argument::__lowest_(sa) == 128);
-    static_assert(cuda::__argument::__highest_(sa) == 512);
+    constexpr auto sa = cuda::argument::constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
+    static_assert(cuda::argument::__lowest_(sa) == 128);
+    static_assert(cuda::argument::__highest_(sa) == 512);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
 #if TEST_HAS_CLASS_NTTP
   // Bounds: empty array sequence has unconstrained element bounds
   {
-    constexpr auto sa = cuda::__argument::__constant_sequence<cuda::std::array<int, 0>{}>{};
-    static_assert(cuda::__argument::__lowest_(sa) == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::__argument::__highest_(sa) == (cuda::std::numeric_limits<int>::max)());
+    constexpr auto sa = cuda::argument::constant_sequence<cuda::std::array<int, 0>{}>{};
+    static_assert(cuda::argument::__lowest_(sa) == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::argument::__highest_(sa) == (cuda::std::numeric_limits<int>::max)());
   }
 #endif // TEST_HAS_CLASS_NTTP
 
   // Traits
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__constant<42>>;
+    using traits = cuda::argument::__traits<cuda::argument::constant<42>>;
     static_assert(!traits::is_deferred);
     static_assert(traits::is_constant);
     static_assert(traits::is_single_value);
@@ -101,7 +101,7 @@ TEST_FUNC void test()
 #if TEST_HAS_CLASS_NTTP
   // Sequence traits
   {
-    using traits = cuda::__argument::__traits<cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>;
+    using traits = cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>;
     static_assert(traits::is_constant);
     static_assert(!traits::is_deferred);
     static_assert(!traits::is_single_value);
@@ -113,25 +113,25 @@ TEST_FUNC void test()
   // Single value: scalar is single, sequence is not
   {
     static_assert(
-      cuda::__argument::__is_single_value_v<cuda::__argument::__traits<cuda::__argument::__constant<42>>::value_type>);
+      cuda::argument::__is_single_value_v<cuda::argument::__traits<cuda::argument::constant<42>>::value_type>);
 #if TEST_HAS_CLASS_NTTP
-    static_assert(!cuda::__argument::__traits<
-                  cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+    static_assert(
+      !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
 #endif // TEST_HAS_CLASS_NTTP
   }
 
   // Unwrap: scalar
   {
-    constexpr auto sa  = cuda::__argument::__constant<42>{};
-    constexpr auto val = cuda::__argument::__unwrap(sa);
+    constexpr auto sa  = cuda::argument::constant<42>{};
+    constexpr auto val = cuda::argument::__unwrap(sa);
     static_assert(val == 42);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Unwrap: sequence
   {
-    constexpr auto sa  = cuda::__argument::__constant_sequence<cuda::std::array<int, 3>{10, 20, 30}>{};
-    constexpr auto val = cuda::__argument::__unwrap(sa);
+    constexpr auto sa  = cuda::argument::constant_sequence<cuda::std::array<int, 3>{10, 20, 30}>{};
+    constexpr auto val = cuda::argument::__unwrap(sa);
     static_assert(val[0] == 10);
     static_assert(val[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
index 19e475ef453..79a96a3366e 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
@@ -8,11 +8,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 
-using arg_t = cuda::__argument::__immediate<unsigned char, cuda::__argument::__static_bounds<0, 1000>>;
+using arg_t = cuda::argument::immediate<unsigned char, cuda::argument::__static_bounds<0, 1000>>;
 
-[[maybe_unused]] constexpr auto invalid_highest = cuda::__argument::__traits<arg_t>::highest;
+[[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits<arg_t>::highest;
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
index 8d240e0cf3e..d3a0499fc55 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
@@ -8,9 +8,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 
-[[maybe_unused]] constexpr auto invalid_bounds = cuda::__argument::__static_bounds<0, 1L>{};
+[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::__static_bounds<0, 1L>{};
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
index e3a752b780a..f449975cb06 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
@@ -13,7 +13,7 @@
 // All argument types (plain values, static, dynamic, deferred) work uniformly
 // through the free functions.
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/algorithm>
 #include <cuda/std/array>
 #include <cuda/std/cassert>
@@ -34,7 +34,7 @@ enum class algorithm_variant
 template <class _SegSizeArg>
 TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg)
 {
-  if constexpr (cuda::__argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity)
+  if constexpr (cuda::argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity)
   {
     return algorithm_variant::shared_memory;
   }
@@ -48,7 +48,7 @@ TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg)
 template <class _SegSizeArg>
 TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_segments)
 {
-  auto __highest = cuda::std::min(default_max_segment_size, static_cast<int>(cuda::__argument::__highest_(__seg_size)));
+  auto __highest = cuda::std::min(default_max_segment_size, static_cast<int>(cuda::argument::__highest_(__seg_size)));
   return __highest * __num_segments;
 }
 
@@ -56,9 +56,9 @@ TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_se
 template <class _SegSizeArg>
 TEST_FUNC constexpr int process_segments(_SegSizeArg __seg_size)
 {
-  const auto& __val = cuda::__argument::__unwrap(__seg_size);
+  const auto& __val = cuda::argument::__unwrap(__seg_size);
 
-  if constexpr (cuda::__argument::__traits<_SegSizeArg>::is_single_value)
+  if constexpr (cuda::argument::__traits<_SegSizeArg>::is_single_value)
   {
     return static_cast<int>(__val);
   }
@@ -93,7 +93,7 @@ TEST_FUNC constexpr bool test()
 
   // static_argument: scalar, fits in shared memory, buffer = value
   {
-    constexpr auto seg_size = cuda::__argument::__constant<128>{};
+    constexpr auto seg_size = cuda::argument::constant<128>{};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 128 * 4);
     assert(process_segments(seg_size) == 128);
@@ -102,7 +102,7 @@ TEST_FUNC constexpr bool test()
 #if TEST_HAS_CLASS_NTTP
   // static_argument: array sequence, highest fits in shared memory
   {
-    constexpr auto seg_sizes = cuda::__argument::__constant_sequence<cuda::std::array{64, 128, 256}>{};
+    constexpr auto seg_sizes = cuda::argument::constant_sequence<cuda::std::array{64, 128, 256}>{};
     static_assert(select_variant(seg_sizes) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 256 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 256);
@@ -110,7 +110,7 @@ TEST_FUNC constexpr bool test()
 
   // static_argument: array sequence, highest exceeds shared memory, buffer clamped
   {
-    constexpr auto seg_sizes = cuda::__argument::__constant_sequence<cuda::std::array{64, 128, 512}>{};
+    constexpr auto seg_sizes = cuda::argument::constant_sequence<cuda::std::array{64, 128, 512}>{};
     static_assert(select_variant(seg_sizes) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 512 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 512);
@@ -119,7 +119,7 @@ TEST_FUNC constexpr bool test()
 
   // dynamic_argument: tight static bounds, shared memory, buffer = value
   {
-    constexpr auto seg_size = cuda::__argument::__immediate{100, cuda::__argument::__bounds<1, 256>()};
+    constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 256>()};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
@@ -127,7 +127,7 @@ TEST_FUNC constexpr bool test()
 
   // dynamic_argument: wide static bounds, global memory, buffer = value
   {
-    constexpr auto seg_size = cuda::__argument::__immediate{100, cuda::__argument::__bounds<1, 4096>()};
+    constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 4096>()};
     static_assert(select_variant(seg_size) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
@@ -135,7 +135,7 @@ TEST_FUNC constexpr bool test()
 
   // dynamic_argument: no bounds, global memory, buffer = value
   {
-    constexpr auto seg_size = cuda::__argument::__immediate{100};
+    constexpr auto seg_size = cuda::argument::immediate{100};
     static_assert(select_variant(seg_size) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
@@ -143,9 +143,8 @@ TEST_FUNC constexpr bool test()
 
   // dynamic_argument: per-segment span with runtime bounds only
   {
-    int sizes[3] = {64, 128, 96};
-    auto seg_sizes =
-      cuda::__argument::__immediate_sequence{cuda::std::span<int>{sizes, 3}, cuda::__argument::__bounds(1, 200)};
+    int sizes[3]   = {64, 128, 96};
+    auto seg_sizes = cuda::argument::immediate_sequence{cuda::std::span<int>{sizes, 3}, cuda::argument::bounds(1, 200)};
     assert(select_variant(seg_sizes) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 200 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 96);
@@ -154,9 +153,9 @@ TEST_FUNC constexpr bool test()
   // dynamic_argument: per-segment span with both bounds
   {
     int sizes[3]   = {64, 128, 96};
-    auto seg_sizes = cuda::__argument::__immediate_sequence{
-      cuda::std::span<int>{sizes, 3}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(1, 200)};
-    static_assert(cuda::__argument::__traits<decltype(seg_sizes)>::highest <= shared_memory_capacity);
+    auto seg_sizes = cuda::argument::immediate_sequence{
+      cuda::std::span<int>{sizes, 3}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)};
+    static_assert(cuda::argument::__traits<decltype(seg_sizes)>::highest <= shared_memory_capacity);
     assert(select_variant(seg_sizes) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 200 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 96);
@@ -165,9 +164,9 @@ TEST_FUNC constexpr bool test()
   // deferred_argument: uniform, bounds for decisions only
   {
     int val       = 100;
-    auto seg_size = cuda::__argument::__deferred{
-      cuda::std::span<int, 1>{&val, 1}, cuda::__argument::__bounds<1, 256>(), cuda::__argument::__bounds(1, 200)};
-    static_assert(cuda::__argument::__traits<decltype(seg_size)>::highest <= shared_memory_capacity);
+    auto seg_size = cuda::argument::deferred{
+      cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)};
+    static_assert(cuda::argument::__traits<decltype(seg_size)>::highest <= shared_memory_capacity);
     assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 200 * 4);
   }
@@ -183,14 +182,14 @@ TEST_FUNC constexpr bool test()
 #if TEST_HAS_CLASS_NTTP
   // static_argument float (float NTTPs require C++20)
   {
-    constexpr auto seg_size = cuda::__argument::__constant<128.0f>{};
+    constexpr auto seg_size = cuda::argument::constant<128.0f>{};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(process_segments(seg_size) == 128);
   }
 
   // dynamic_argument float with static bounds
   {
-    constexpr auto seg_size = cuda::__argument::__immediate{100.0f, cuda::__argument::__bounds<1.0f, 256.0f>()};
+    constexpr auto seg_size = cuda::argument::immediate{100.0f, cuda::argument::bounds<1.0f, 256.0f>()};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(process_segments(seg_size) == 100);
   }

From 3210e3d096dd515e8143d1d9c18512b0cf225832 Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Wed, 3 Jun 2026 23:35:26 -0700
Subject: [PATCH 2/5] Forgot about the bounds types

---
 libcudacxx/include/cuda/__argument/argument.h | 87 +++++++++----------
 .../include/cuda/__argument/argument_bounds.h | 20 ++---
 .../cuda/argument/argument_bounds.pass.cpp    | 16 ++--
 .../cuda/argument/argument_traits.pass.cpp    |  6 +-
 .../cuda/argument/deferred_argument.pass.cpp  |  2 +-
 .../static_bounds_conversion.fail.cpp         |  2 +-
 .../static_bounds_type_mismatch.fail.cpp      |  2 +-
 7 files changed, 67 insertions(+), 68 deletions(-)

diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
index 42d66233c6e..035e0cdcf2d 100644
--- a/libcudacxx/include/cuda/__argument/argument.h
+++ b/libcudacxx/include/cuda/__argument/argument.h
@@ -174,13 +174,13 @@ template <class _ElementType, class _StaticBounds>
 inline constexpr bool __valid_static_bounds_v = true;
 
 template <class _ElementType, auto _Lowest, auto _Highest>
-inline constexpr bool __valid_static_bounds_v<_ElementType, __static_bounds<_Lowest, _Highest>> =
+inline constexpr bool __valid_static_bounds_v<_ElementType, static_bounds<_Lowest, _Highest>> =
   __static_bound_in_range<_ElementType, _Lowest>() && __static_bound_in_range<_ElementType, _Highest>();
 
 template <class _ElementType, class _StaticBounds>
 _CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept
 {
-  if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
+  if constexpr (::cuda::std::is_same_v<_StaticBounds, no_bounds>)
   {
     return ::cuda::std::numeric_limits<_ElementType>::lowest();
   }
@@ -193,7 +193,7 @@ _CCCL_API constexpr _ElementType __wrapper_static_lowest() noexcept
 template <class _ElementType, class _StaticBounds>
 _CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept
 {
-  if constexpr (::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
+  if constexpr (::cuda::std::is_same_v<_StaticBounds, no_bounds>)
   {
     return (::cuda::std::numeric_limits<_ElementType>::max)();
   }
@@ -204,28 +204,28 @@ _CCCL_API constexpr _ElementType __wrapper_static_highest() noexcept
 }
 
 template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr _ElementType __effective_lowest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+_CCCL_API constexpr _ElementType __effective_lowest(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   auto __static_lowest = __wrapper_static_lowest<_ElementType, _StaticBounds>();
   return __static_lowest > __runtime_bounds.lower() ? __static_lowest : __runtime_bounds.lower();
 }
 
 template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr _ElementType __effective_highest(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+_CCCL_API constexpr _ElementType __effective_highest(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   auto __static_highest = __wrapper_static_highest<_ElementType, _StaticBounds>();
   return __static_highest < __runtime_bounds.upper() ? __static_highest : __runtime_bounds.upper();
 }
 
 template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr bool __has_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+_CCCL_API constexpr bool __has_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   return __effective_lowest<_ElementType, _StaticBounds>(__runtime_bounds)
       <= __effective_highest<_ElementType, _StaticBounds>(__runtime_bounds);
 }
 
 template <class _ElementType, class _StaticBounds>
-_CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_ElementType> __runtime_bounds) noexcept
+_CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>,
                 "static argument bounds cannot be represented by the element type");
@@ -236,7 +236,7 @@ _CCCL_API constexpr void __validate_bounds_intersection(__runtime_bounds<_Elemen
 template <class _ElementType, class _StaticBounds>
 _CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const _ElementType& __val) noexcept
 {
-  if constexpr (!::cuda::std::is_same_v<_StaticBounds, __no_bounds>)
+  if constexpr (!::cuda::std::is_same_v<_StaticBounds, no_bounds>)
   {
     _CCCL_ASSERT((__val >= __wrapper_static_lowest<_ElementType, _StaticBounds>()),
                  "immediate argument value is below static lowest bound");
@@ -247,7 +247,7 @@ _CCCL_API constexpr void __validate_static_element_bounds([[maybe_unused]] const
 
 template <class _ElementType>
 _CCCL_API constexpr void __validate_runtime_element_bounds(
-  [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] __runtime_bounds<_ElementType> __runtime_bounds) noexcept
+  [[maybe_unused]] const _ElementType& __val, [[maybe_unused]] runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   _CCCL_ASSERT((__val >= __runtime_bounds.lower()), "immediate argument value is below runtime lower bound");
   _CCCL_ASSERT((__val <= __runtime_bounds.upper()), "immediate argument value is above runtime upper bound");
@@ -260,7 +260,7 @@ _CCCL_API constexpr void __validate_runtime_element_bounds(
 //! @brief Wraps a runtime argument value with optional bounds.
 //!
 //! The value is host-accessible at API call time.
-template <class _Arg, class _StaticBounds = __no_bounds>
+template <class _Arg, class _StaticBounds = no_bounds>
 struct immediate
 {
   using __element_type = __element_type_of_t<_Arg>;
@@ -296,8 +296,8 @@ struct immediate
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> immediate<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate(_Arg, static_bounds<_Lowest, _Highest>)
+  -> immediate<_Arg, static_bounds<_Lowest, _Highest>>;
 
 #endif // _CCCL_DOXYGEN_INVOKED
 
@@ -306,7 +306,7 @@ _CCCL_HOST_DEVICE immediate(_Arg, __static_bounds<_Lowest, _Highest>)
 // =====================================================================
 
 //! @brief Wraps a runtime argument sequence with optional bounds.
-template <class _Arg, class _StaticBounds = __no_bounds>
+template <class _Arg, class _StaticBounds = no_bounds>
 struct immediate_sequence
 {
   using __element_type = __element_type_of_t<_Arg>;
@@ -316,7 +316,7 @@ struct immediate_sequence
                 "static argument bounds cannot be represented by the element type");
 
   _Arg __arg_;
-  __runtime_bounds<__element_type> __runtime_bounds_{};
+  runtime_bounds<__element_type> __runtime_bounds_{};
 
 private:
   _CCCL_API constexpr void __validate_bounds() const noexcept
@@ -357,7 +357,7 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -367,7 +367,7 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -377,23 +377,23 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
+  _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
       : immediate_sequence(::cuda::std::move(__arg), __sb, __rb)
   {}
 };
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>)
+  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>)
+  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> immediate_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>)
+  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
@@ -401,7 +401,7 @@ _CCCL_HOST_DEVICE immediate_sequence(_Arg, __runtime_bounds<_Tp>, __static_bound
 // =====================================================================
 
 //! @brief Common base for deferred argument wrappers.
-template <class _Arg, class _StaticBounds = __no_bounds>
+template <class _Arg, class _StaticBounds = no_bounds>
 struct __deferred_base
 {
   using __element_type = __element_type_of_t<_Arg>;
@@ -410,7 +410,7 @@ struct __deferred_base
                 "static argument bounds cannot be represented by the element type");
 
   _Arg __arg_;
-  __runtime_bounds<__element_type> __runtime_bounds_{};
+  runtime_bounds<__element_type> __runtime_bounds_{};
 
   _CCCL_API constexpr __deferred_base(_Arg __arg) noexcept
       : __arg_{::cuda::std::move(__arg)}
@@ -425,7 +425,7 @@ struct __deferred_base
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __deferred_base(_Arg __arg, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr __deferred_base(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -434,7 +434,7 @@ struct __deferred_base
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __deferred_base(_Arg __arg, _StaticBounds, __runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr __deferred_base(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -443,14 +443,14 @@ struct __deferred_base
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr __deferred_base(_Arg __arg, __runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
+  _CCCL_API constexpr __deferred_base(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
       : __deferred_base(::cuda::std::move(__arg), __sb, __rb)
   {}
 };
 
 //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available
 //! by the time the argument is consumed in stream order.
-template <class _Arg, class _StaticBounds = __no_bounds>
+template <class _Arg, class _StaticBounds = no_bounds>
 struct deferred : __deferred_base<_Arg, _StaticBounds>
 {
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
@@ -461,24 +461,23 @@ template <class _Arg>
 _CCCL_HOST_DEVICE deferred(_Arg) -> deferred<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, static_bounds<_Lowest, _Highest>) -> deferred<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp>
-_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>) -> deferred<_Arg>;
+_CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>) -> deferred<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE deferred(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>)
+  -> deferred<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE deferred(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> deferred<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>)
+  -> deferred<_Arg, static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be
 //! available by the time the argument is consumed in stream order.
-template <class _Arg, class _StaticBounds = __no_bounds>
+template <class _Arg, class _StaticBounds = no_bounds>
 struct deferred_sequence : __deferred_base<_Arg, _StaticBounds>
 {
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
@@ -489,19 +488,19 @@ template <class _Arg>
 _CCCL_HOST_DEVICE deferred_sequence(_Arg) -> deferred_sequence<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>)
-  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, static_bounds<_Lowest, _Highest>)
+  -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp>
-_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>) -> deferred_sequence<_Arg>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>) -> deferred_sequence<_Arg>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE deferred_sequence(_Arg, __static_bounds<_Lowest, _Highest>, __runtime_bounds<_Tp>)
-  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>)
+  -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE deferred_sequence(_Arg, __runtime_bounds<_Tp>, __static_bounds<_Lowest, _Highest>)
-  -> deferred_sequence<_Arg, __static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>)
+  -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h
index 5ccaafb4589..630ae897910 100644
--- a/libcudacxx/include/cuda/__argument/argument_bounds.h
+++ b/libcudacxx/include/cuda/__argument/argument_bounds.h
@@ -31,7 +31,7 @@
 _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT
 
 //! @brief Sentinel type indicating no bounds are present.
-struct __no_bounds
+struct no_bounds
 {};
 
 // =====================================================================
@@ -45,7 +45,7 @@ struct __no_bounds
 //! @tparam _Lower The static lower bound.
 //! @tparam _Upper The static upper bound.
 template <auto _Lower, auto _Upper>
-struct __static_bounds
+struct static_bounds
 {
   static_assert(::cuda::std::is_same_v<decltype(_Lower), decltype(_Upper)>,
                 "Static bounds endpoints must have the same type");
@@ -64,7 +64,7 @@ struct __static_bounds
 template <class _Tp>
 inline constexpr bool __is_static_bounds_v = false;
 template <auto _Lower, auto _Upper>
-inline constexpr bool __is_static_bounds_v<__static_bounds<_Lower, _Upper>> = true;
+inline constexpr bool __is_static_bounds_v<static_bounds<_Lower, _Upper>> = true;
 
 // =====================================================================
 // runtime_bounds
@@ -74,14 +74,14 @@ inline constexpr bool __is_static_bounds_v<__static_bounds<_Lower, _Upper>> = tr
 //!
 //! @tparam _Tp The value type of the bounds.
 template <class _Tp>
-struct __runtime_bounds
+struct runtime_bounds
 {
   _Tp __lower_ = ::cuda::std::numeric_limits<_Tp>::lowest();
   _Tp __upper_ = (::cuda::std::numeric_limits<_Tp>::max)();
 
-  constexpr __runtime_bounds() noexcept = default;
+  constexpr runtime_bounds() noexcept = default;
 
-  _CCCL_API constexpr __runtime_bounds(_Tp __lower, _Tp __upper) noexcept
+  _CCCL_API constexpr runtime_bounds(_Tp __lower, _Tp __upper) noexcept
       : __lower_(__lower)
       , __upper_(__upper)
   {
@@ -101,13 +101,13 @@ struct __runtime_bounds
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Tp>
-_CCCL_HOST_DEVICE __runtime_bounds(_Tp, _Tp) -> __runtime_bounds<_Tp>;
+_CCCL_HOST_DEVICE runtime_bounds(_Tp, _Tp) -> runtime_bounds<_Tp>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 template <class _Tp>
 inline constexpr bool __is_runtime_bounds_v = false;
 template <class _Tp>
-inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true;
+inline constexpr bool __is_runtime_bounds_v<runtime_bounds<_Tp>> = true;
 
 // =====================================================================
 // bounds — factory functions
@@ -119,7 +119,7 @@ inline constexpr bool __is_runtime_bounds_v<__runtime_bounds<_Tp>> = true;
 //! @tparam _Upper The static upper bound.
 //! @return A compile-time bounds object.
 template <auto _Lower, auto _Upper>
-[[nodiscard]] _CCCL_API constexpr __static_bounds<_Lower, _Upper> bounds() noexcept
+[[nodiscard]] _CCCL_API constexpr static_bounds<_Lower, _Upper> bounds() noexcept
 {
   return {};
 }
@@ -130,7 +130,7 @@ template <auto _Lower, auto _Upper>
 //! @param __upper The runtime upper bound.
 //! @return A runtime bounds object.
 template <class _Tp>
-[[nodiscard]] _CCCL_API constexpr __runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept
+[[nodiscard]] _CCCL_API constexpr runtime_bounds<_Tp> bounds(_Tp __lower, _Tp __upper) noexcept
 {
   return {__lower, __upper};
 }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
index 9a5717f32c3..4275c4aca9c 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
@@ -20,21 +20,21 @@ TEST_FUNC constexpr bool test()
 
   // Basic static bounds
   {
-    constexpr auto b = cuda::argument::__static_bounds<1, 4096>{};
+    constexpr auto b = cuda::argument::static_bounds<1, 4096>{};
     static_assert(b.lower() == 1);
     static_assert(b.upper() == 4096);
   }
 
   // Exact static bounds
   {
-    constexpr auto b = cuda::argument::__static_bounds<42, 42>{};
+    constexpr auto b = cuda::argument::static_bounds<42, 42>{};
     static_assert(b.lower() == 42);
     static_assert(b.upper() == 42);
   }
 
   // Long type deduced from NTTPs
   {
-    static_assert(cuda::std::is_same_v<decltype(cuda::argument::__static_bounds<0L, 1000L>::lower()), long>);
+    static_assert(cuda::std::is_same_v<decltype(cuda::argument::static_bounds<0L, 1000L>::lower()), long>);
   }
 
 #if TEST_HAS_CLASS_NTTP
@@ -52,7 +52,7 @@ TEST_FUNC constexpr bool test()
 
   // Basic runtime bounds
   {
-    auto b = cuda::argument::__runtime_bounds{10, 100};
+    auto b = cuda::argument::runtime_bounds{10, 100};
     assert(b.lower() == 10);
     assert(b.upper() == 100);
     assert(b.__lower_ == 10);
@@ -86,10 +86,10 @@ TEST_FUNC constexpr bool test()
 
   // Static and runtime bounds intersection
   {
-    static_assert(cuda::argument::__has_bounds_intersection<int, cuda::argument::__static_bounds<1, 100>>(
-      cuda::argument::__runtime_bounds<int>{50, 200}));
-    static_assert(!cuda::argument::__has_bounds_intersection<int, cuda::argument::__static_bounds<100, 200>>(
-      cuda::argument::__runtime_bounds<int>{0, 50}));
+    static_assert(cuda::argument::__has_bounds_intersection<int, cuda::argument::static_bounds<1, 100>>(
+      cuda::argument::runtime_bounds<int>{50, 200}));
+    static_assert(!cuda::argument::__has_bounds_intersection<int, cuda::argument::static_bounds<100, 200>>(
+      cuda::argument::runtime_bounds<int>{0, 50}));
   }
 
   // Non-bounds type
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
index 50a6b51c932..632fe50945a 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
@@ -143,12 +143,12 @@ TEST_FUNC void test()
   static_assert(cuda::argument::__traits<float>::lowest == cuda::std::numeric_limits<float>::lowest());
   static_assert(cuda::argument::__traits<float>::highest == (cuda::std::numeric_limits<float>::max)());
   static_assert(
-    cuda::argument::__traits<const cuda::argument::immediate<int, cuda::argument::__static_bounds<1, 8>>>::lowest == 1);
+    cuda::argument::__traits<const cuda::argument::immediate<int, cuda::argument::static_bounds<1, 8>>>::lowest == 1);
   static_assert(
-    cuda::argument::__traits<cuda::argument::immediate<int, cuda::argument::__static_bounds<1, 8>>&>::highest == 8);
+    cuda::argument::__traits<cuda::argument::immediate<int, cuda::argument::static_bounds<1, 8>>&>::highest == 8);
   static_assert(
     cuda::argument::__traits<
-      cuda::argument::immediate_sequence<cuda::std::span<int>, cuda::argument::__static_bounds<1, 8>>>::highest
+      cuda::argument::immediate_sequence<cuda::std::span<int>, cuda::argument::static_bounds<1, 8>>>::highest
     == 8);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
index f9bda950aa7..5dc2fb849c7 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
@@ -41,7 +41,7 @@ TEST_FUNC constexpr bool test()
   // Deferred single value via pointer
   {
     int val     = 42;
-    using def_t = cuda::argument::deferred<int*, cuda::argument::__static_bounds<0, 100>>;
+    using def_t = cuda::argument::deferred<int*, cuda::argument::static_bounds<0, 100>>;
     static_assert(cuda::argument::__traits<def_t>::lowest == 0);
     static_assert(cuda::argument::__traits<def_t>::highest == 100);
     // Also verify construction works
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
index 79a96a3366e..7f9902e50a2 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-using arg_t = cuda::argument::immediate<unsigned char, cuda::argument::__static_bounds<0, 1000>>;
+using arg_t = cuda::argument::immediate<unsigned char, cuda::argument::static_bounds<0, 1000>>;
 
 [[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits<arg_t>::highest;
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
index d3a0499fc55..8170fdd7ee5 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::__static_bounds<0, 1L>{};
+[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::static_bounds<0, 1L>{};
 
 int main(int, char**)
 {

From 2e084a921827f40709755e4e57084f8d8b7a3f80 Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Thu, 4 Jun 2026 14:04:59 -0700
Subject: [PATCH 3/5] Tighten bounds validation

---
 libcudacxx/include/cuda/__argument/argument.h | 29 +++++++++++++------
 .../cuda/argument/argument_bounds.pass.cpp    |  8 +++++
 .../static_argument_bounds_type.fail.cpp      | 20 +++++++++++++
 3 files changed, 48 insertions(+), 9 deletions(-)
 create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp

diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
index 035e0cdcf2d..b955e1f428f 100644
--- a/libcudacxx/include/cuda/__argument/argument.h
+++ b/libcudacxx/include/cuda/__argument/argument.h
@@ -171,7 +171,10 @@ _CCCL_API constexpr bool __static_bound_in_range() noexcept
 }
 
 template <class _ElementType, class _StaticBounds>
-inline constexpr bool __valid_static_bounds_v = true;
+inline constexpr bool __valid_static_bounds_v = false;
+
+template <class _ElementType>
+inline constexpr bool __valid_static_bounds_v<_ElementType, no_bounds> = true;
 
 template <class _ElementType, auto _Lowest, auto _Highest>
 inline constexpr bool __valid_static_bounds_v<_ElementType, static_bounds<_Lowest, _Highest>> =
@@ -228,7 +231,8 @@ template <class _ElementType, class _StaticBounds>
 _CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
   _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)),
                "static and runtime argument bounds do not intersect");
 }
@@ -266,7 +270,8 @@ struct immediate
   using __element_type = __element_type_of_t<_Arg>;
 
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   _Arg __arg_;
 
@@ -313,7 +318,8 @@ struct immediate_sequence
 
   static_assert(__is_sequence_v<_Arg>, "immediate sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   _Arg __arg_;
   runtime_bounds<__element_type> __runtime_bounds_{};
@@ -407,7 +413,8 @@ struct __deferred_base
   using __element_type = __element_type_of_t<_Arg>;
 
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   _Arg __arg_;
   runtime_bounds<__element_type> __runtime_bounds_{};
@@ -693,7 +700,8 @@ struct __traits_impl<immediate<_Arg, _StaticBounds>>
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   static constexpr bool is_constant     = false;
   static constexpr bool is_deferred     = false;
@@ -722,7 +730,8 @@ struct __traits_impl<immediate_sequence<_Arg, _StaticBounds>>
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__is_sequence_v<value_type>, "immediate sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   static constexpr bool is_constant     = false;
   static constexpr bool is_deferred     = false;
@@ -737,7 +746,8 @@ struct __traits_impl<deferred<_Arg, _StaticBounds>>
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   static constexpr bool is_constant     = false;
   static constexpr bool is_deferred     = true;
@@ -752,7 +762,8 @@ struct __traits_impl<deferred_sequence<_Arg, _StaticBounds>>
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "static argument bounds cannot be represented by the element type");
+                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "values representable by the element type");
 
   static constexpr bool is_constant     = false;
   static constexpr bool is_deferred     = true;
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
index 4275c4aca9c..6be3c0d925b 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
@@ -97,6 +97,14 @@ TEST_FUNC constexpr bool test()
     static_assert(!cuda::argument::__is_bounds_v<int>);
   }
 
+  // Bounds types accepted by argument wrapper template parameters
+  {
+    static_assert(cuda::argument::__valid_static_bounds_v<int, cuda::argument::no_bounds>);
+    static_assert(cuda::argument::__valid_static_bounds_v<int, cuda::argument::static_bounds<1, 8>>);
+    static_assert(!cuda::argument::__valid_static_bounds_v<int, cuda::argument::runtime_bounds<int>>);
+    static_assert(!cuda::argument::__valid_static_bounds_v<int, int>);
+  }
+
   return true;
 }
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp
new file mode 100644
index 00000000000..66a1828769b
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/argument>
+
+using arg_t = cuda::argument::immediate<int, cuda::argument::runtime_bounds<int>>;
+
+[[maybe_unused]] arg_t invalid_arg{0};
+
+int main(int, char**)
+{
+  return 0;
+}

From 2c740b7dfa2080377f6c51928e2432fe8f5e4413 Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Thu, 4 Jun 2026 14:12:53 -0700
Subject: [PATCH 4/5] Asssert deferred_sequence type is a sequence

---
 libcudacxx/include/cuda/__argument/argument.h |  3 +++
 .../deferred_sequence_scalar.fail.cpp         | 18 +++++++++++++++++
 .../deferred_sequence_scalar_traits.fail.cpp  | 20 +++++++++++++++++++
 3 files changed, 41 insertions(+)
 create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp

diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
index b955e1f428f..8c1933bdc7b 100644
--- a/libcudacxx/include/cuda/__argument/argument.h
+++ b/libcudacxx/include/cuda/__argument/argument.h
@@ -487,6 +487,8 @@ _CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Hi
 template <class _Arg, class _StaticBounds = no_bounds>
 struct deferred_sequence : __deferred_base<_Arg, _StaticBounds>
 {
+  static_assert(__is_sequence_v<_Arg>, "deferred sequence arguments must have a distinct element type");
+
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
 };
 
@@ -761,6 +763,7 @@ struct __traits_impl<deferred_sequence<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
+  static_assert(__is_sequence_v<value_type>, "deferred sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
                 "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
                 "values representable by the element type");
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
new file mode 100644
index 00000000000..1ce371b84e5
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
@@ -0,0 +1,18 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/argument>
+
+[[maybe_unused]] cuda::argument::deferred_sequence<int> invalid_arg{0};
+
+int main(int, char**)
+{
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp
new file mode 100644
index 00000000000..41a9a2ae778
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/argument>
+
+using traits = cuda::argument::__traits<cuda::argument::deferred_sequence<int>>;
+
+[[maybe_unused]] constexpr bool invalid_traits = traits::is_deferred;
+
+int main(int, char**)
+{
+  return 0;
+}

From 9088870b74443d908b05c80e381d6e129b269b7d Mon Sep 17 00:00:00 2001
From: Piotr Ciolkosz <pciolkosz@nvidia.com>
Date: Fri, 5 Jun 2026 15:20:02 -0700
Subject: [PATCH 5/5] Feedback from review meeting

---
 .../bench/segmented_topk/fixed/keys.cu        |  10 +-
 .../bench/segmented_topk/variable/keys.cu     |  12 +-
 cub/cub/agent/agent_batched_topk.cuh          |  10 +-
 cub/cub/detail/segmented_params.cuh           |  34 +-
 .../device/dispatch/dispatch_batched_topk.cuh |  18 +-
 .../dispatch/kernels/kernel_batched_topk.cuh  |   8 +-
 .../catch2_test_device_segmented_topk_keys.cu |  26 +-
 ...catch2_test_device_segmented_topk_pairs.cu |  18 +-
 libcudacxx/include/cuda/__argument/argument.h | 317 ++++++++++++------
 .../include/cuda/__argument/argument_bounds.h |   6 +-
 .../include/cuda/std/__internal/namespaces.h  |   4 +-
 .../cuda/argument/argument_bounds.pass.cpp    |  52 ++-
 .../cuda/argument/argument_traits.pass.cpp    | 165 +++++----
 .../cuda/argument/deferred_argument.pass.cpp  | 118 +++----
 .../deferred_sequence_scalar.fail.cpp         |   2 +-
 .../deferred_sequence_scalar_traits.fail.cpp  |   2 +-
 .../cuda/argument/dynamic_argument.pass.cpp   | 123 ++++---
 .../cuda/argument/static_argument.pass.cpp    |  85 +++--
 .../static_argument_bounds_type.fail.cpp      |   2 +-
 .../static_bounds_conversion.fail.cpp         |   4 +-
 .../static_bounds_type_mismatch.fail.cpp      |   2 +-
 .../cuda/argument/usage_example.pass.cpp      |  71 ++--
 22 files changed, 610 insertions(+), 479 deletions(-)

diff --git a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
index 4178e7ea0d5..77bd997b742 100644
--- a/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
+++ b/cub/benchmarks/bench/segmented_topk/fixed/keys.cu
@@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys(
   const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected);
   const auto num_segments      = ::cuda::std::max<std::size_t>(1, (max_elements / segment_size));
   const auto elements          = num_segments * segment_size;
-  const auto total_num_items   = ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(elements)};
+  const auto total_num_items   = ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(elements)};
   const bit_entropy entropy    = str_to_entropy(state.get_string("Entropy"));
 
   // Skip workloads where k exceeds the segment size
@@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys(
   auto d_keys_in      = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size);
   auto d_keys_out     = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements);
 
-  auto segment_sizes    = ::cuda::argument::constant<MaxSegmentSize>{};
-  auto k                = ::cuda::argument::constant<MaxNumSelected>{};
-  auto select_direction = ::cuda::argument::constant<cub::detail::topk::select::max>{};
+  auto segment_sizes    = ::cuda::args::constant<MaxSegmentSize>{};
+  auto k                = ::cuda::args::constant<MaxNumSelected>{};
+  auto select_direction = ::cuda::args::constant<cub::detail::topk::select::max>{};
 
   state.add_element_count(elements, "NumElements");
   state.add_element_count(segment_size, "SegmentSize");
@@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys(
       segment_sizes,
       k,
       select_direction,
-      ::cuda::argument::immediate{static_cast<::cuda::std::int64_t>(num_segments)},
+      ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(num_segments)},
       total_num_items,
       env);
   });
diff --git a/cub/benchmarks/bench/segmented_topk/variable/keys.cu b/cub/benchmarks/bench/segmented_topk/variable/keys.cu
index d14394afc39..4f7effc912a 100644
--- a/cub/benchmarks/bench/segmented_topk/variable/keys.cu
+++ b/cub/benchmarks/bench/segmented_topk/variable/keys.cu
@@ -172,17 +172,17 @@ void variable_seg_size_topk_keys(nvbench::state& state,
     static_cast<cuda::std::int64_t>(MaxSegmentSize));
   const auto input_elements  = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
   const auto output_elements = static_cast<std::size_t>(num_segments) * K;
-  const auto total_num_items = ::cuda::argument::immediate{static_cast<cuda::std::int64_t>(input_elements)};
+  const auto total_num_items = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(input_elements)};
 
   auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
     num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
   auto out_keys_buffer = thrust::device_vector<KeyT>(output_elements, thrust::no_init);
 
-  auto segment_sizes_param = ::cuda::argument::immediate_sequence{
-    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::argument::bounds<1, MaxSegmentSize>()};
-  auto k_param            = ::cuda::argument::constant<K>{};
-  auto select_direction   = ::cuda::argument::constant<cub::detail::topk::select::max>{};
-  auto num_segments_param = ::cuda::argument::immediate{static_cast<cuda::std::int64_t>(num_segments)};
+  auto segment_sizes_param = ::cuda::args::__immediate_sequence{
+    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()};
+  auto k_param            = ::cuda::args::constant<K>{};
+  auto select_direction   = ::cuda::args::constant<cub::detail::topk::select::max>{};
+  auto num_segments_param = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(num_segments)};
 
   auto d_keys_in = cuda::make_strided_iterator(
     cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),
diff --git a/cub/cub/agent/agent_batched_topk.cuh b/cub/cub/agent/agent_batched_topk.cuh
index bc10311d36d..d2a99cfc809 100644
--- a/cub/cub/agent/agent_batched_topk.cuh
+++ b/cub/cub/agent/agent_batched_topk.cuh
@@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment
   using key_t   = it_value_t<key_it_t>;
   using value_t = it_value_t<value_it_t>;
 
-  using segment_size_val_t = typename ::cuda::argument::__traits<SegmentSizeParameterT>::element_type;
-  using num_segments_val_t = typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type;
+  using segment_size_val_t = typename ::cuda::args::__traits<SegmentSizeParameterT>::element_type;
+  using num_segments_val_t = typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type;
   using counters_t         = batched_topk_counters<num_segments_val_t>;
 
   static constexpr auto policy                 = PolicyGetter{}();
@@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment
     multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread;
 
   // Check if there could be large segments present
-  static constexpr bool only_small_segments = ::cuda::argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
+  static constexpr bool only_small_segments = ::cuda::args::__traits<SegmentSizeParameterT>::highest <= tile_size;
 
   // Check if we are dealing with keys-only or key-value pairs
   static constexpr bool is_keys_only = ::cuda::std::is_same_v<value_t, cub::NullType>;
@@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment
       return;
     }
 
-    constexpr bool is_full_tile = ::cuda::argument::__traits<SegmentSizeParameterT>::is_constant
-                               && ::cuda::argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
+    constexpr bool is_full_tile = ::cuda::args::__traits<SegmentSizeParameterT>::is_constant
+                               && ::cuda::args::__traits<SegmentSizeParameterT>::lowest == tile_size;
 
     // Resolve Segment Parameters
     const auto segment_size = params::get_param(segment_sizes, segment_id);
diff --git a/cub/cub/detail/segmented_params.cuh b/cub/cub/detail/segmented_params.cuh
index e9a999a87b1..624a5da1a23 100644
--- a/cub/cub/detail/segmented_params.cuh
+++ b/cub/cub/detail/segmented_params.cuh
@@ -33,10 +33,10 @@ namespace detail::params
 //! @param[in] __index Segment index to read for sequence arguments.
 //! @return The single argument value, or the sequence element at the given index.
 _CCCL_TEMPLATE(class _Tp, class _SegmentIndexT)
-_CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
+_CCCL_REQUIRES((!::cuda::args::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  if constexpr (::cuda::argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
+  if constexpr (::cuda::args::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
   {
     return __arg;
   }
@@ -46,46 +46,46 @@ _CCCL_REQUIRES((!::cuda::argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_T
   }
 }
 
-template <auto _Value, class _SegmentIndexT>
+template <auto _Value, class _Tp, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::argument::constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::constant<_Value, _Tp>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg);
+  return ::cuda::args::__unwrap(__arg);
 }
 
 template <auto _Value, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::argument::constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg)[__index];
+  return ::cuda::args::__unwrap(__arg)[__index];
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
-[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
-  const ::cuda::argument::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
+get_param(const ::cuda::args::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg);
+  return ::cuda::args::__unwrap(__arg);
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::argument::immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg)[__index];
+  return ::cuda::args::__unwrap(__arg)[__index];
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
-[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
-  const ::cuda::argument::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
+get_param(const ::cuda::args::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg);
+  return ::cuda::args::__unwrap(__arg);
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::argument::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::argument::__unwrap(__arg)[__index];
+  return ::cuda::args::__unwrap(__arg)[__index];
 }
 
 // =====================================================================
diff --git a/cub/cub/device/dispatch/dispatch_batched_topk.cuh b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
index 68a3a4f26b0..b1151f0dc4a 100644
--- a/cub/cub/device/dispatch/dispatch_batched_topk.cuh
+++ b/cub/cub/device/dispatch/dispatch_batched_topk.cuh
@@ -50,8 +50,8 @@ namespace detail::batched_topk
 // -----------------------------------------------------------------------------
 
 // Uniform (compile-time): constant<Dir> -> single-option uniform_discrete_param.
-template <detail::topk::select Dir>
-[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::argument::constant<Dir>)
+template <detail::topk::select Dir, class _Tp>
+[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::args::constant<Dir, _Tp>)
 {
   return params::uniform_discrete_param<detail::topk::select, Dir>{Dir};
 }
@@ -126,7 +126,7 @@ template <typename KeyInputItItT,
           typename PolicySelector = policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
                                                                it_value_t<it_value_t<ValueInputItItT>>,
                                                                ::cuda::std::int64_t,
-                                                               ::cuda::argument::__traits<KParameterT>::highest>>
+                                                               ::cuda::args::__traits<KParameterT>::highest>>
 #if _CCCL_HAS_CONCEPTS()
   requires batched_topk_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
@@ -145,7 +145,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream                             = nullptr,
   [[maybe_unused]] PolicySelector policy_selector = {})
 {
-  using large_segment_tile_offset_t = typename ::cuda::argument::__traits<TotalNumItemsGuaranteeT>::element_type;
+  using large_segment_tile_offset_t = typename ::cuda::args::__traits<TotalNumItemsGuaranteeT>::element_type;
 
   // Wrap the raw enum into the internal discrete param type
   auto select_directions          = wrap_select_direction(select_direction);
@@ -171,9 +171,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   static constexpr int worker_per_segment_tile_size =
     worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread;
   static constexpr bool any_small_segments =
-    ::cuda::argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
+    ::cuda::args::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
   static constexpr bool only_small_segments =
-    ::cuda::argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
+    ::cuda::args::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
 
   // Allocation layout:
   //   only_small_segments: [0] dummy.
@@ -183,7 +183,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   static constexpr int allocations_array_size     = only_small_segments ? 1 : (any_small_segments ? 3 : 2);
   size_t allocation_sizes[allocations_array_size] = {1};
 
-  using num_segments_val_t         = typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type;
+  using num_segments_val_t         = typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type;
   using counters_t                 = batched_topk_counters<num_segments_val_t>;
   using segment_size_scan_offset_t = detail::choose_offset_t<num_segments_val_t>;
   using segment_size_scan_input_op_t =
@@ -239,7 +239,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
 
   // TODO (elstehle): support number of segments provided by device-accessible iterator
   // Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host)
-  static_assert(::cuda::argument::__traits<NumSegmentsParameterT>::is_single_value,
+  static_assert(::cuda::args::__traits<NumSegmentsParameterT>::is_single_value,
                 "Only uniform segment sizes are currently supported.");
 
   if constexpr (any_small_segments)
@@ -341,7 +341,7 @@ template <typename KeyInputItItT,
     policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
                                it_value_t<it_value_t<ValueInputItItT>>,
                                ::cuda::std::int64_t,
-                               ::cuda::argument::__traits<KParameterT>::highest>;
+                               ::cuda::args::__traits<KParameterT>::highest>;
   return detail::dispatch_with_env_and_tuning<default_policy_selector>(
     env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
       return dispatch(
diff --git a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
index 9c904f95971..3412f40359a 100644
--- a/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
+++ b/cub/cub/device/dispatch/kernels/kernel_batched_topk.cuh
@@ -39,7 +39,7 @@ private:
     worker_policy worker_per_segment_policy;
     multi_worker_policy multi_worker_per_segment_policy;
   };
-  static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::argument::__traits<SegmentSizeParameterT>::highest;
+  static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::args::__traits<SegmentSizeParameterT>::highest;
   static constexpr batched_topk_policy active_policy     = current_policy<PolicySelector>();
 
   template <int Index>
@@ -133,8 +133,8 @@ __launch_bounds__(int(
     KParameterT k,
     SelectDirectionParameterT select_directions,
     NumSegmentsParameterT num_segments,
-    batched_topk_counters<typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type>* d_counters,
-    typename ::cuda::argument::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
+    batched_topk_counters<typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type>* d_counters,
+    typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
     LargeSegmentTileOffsetT* d_large_segments_tile_offsets)
 {
   using agent_t = typename find_smallest_covering_policy<
@@ -151,7 +151,7 @@ __launch_bounds__(int(
     LargeSegmentTileOffsetT>::agent_t;
 
   // Static Assertions (Constraints)
-  static_assert(agent_t::tile_size >= ::cuda::argument::__traits<SegmentSizeParameterT>::highest,
+  static_assert(agent_t::tile_size >= ::cuda::args::__traits<SegmentSizeParameterT>::highest,
                 "Block size exceeds maximum segment size supported by SegmentSizeParameterT");
   static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block,
                 "Static shared memory per block must not exceed 48KB limit.");
diff --git a/cub/test/catch2_test_device_segmented_topk_keys.cu b/cub/test/catch2_test_device_segmented_topk_keys.cu
index e16d5637f19..433e6b1a35a 100644
--- a/cub/test/catch2_test_device_segmented_topk_keys.cu
+++ b/cub/test/catch2_test_device_segmented_topk_keys.cu
@@ -151,11 +151,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
   batched_topk_keys(
     d_keys_in,
     d_keys_out,
-    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<segment_size_t{1}, max_segment_size>()},
-    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::args::immediate{segment_size, ::cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
     direction,
-    ::cuda::argument::immediate{num_segments},
-    ::cuda::argument::immediate{num_segments * segment_size});
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_segments * segment_size});
   // Prepare expected results
   fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction);
   compact_sorted_keys_to_topk(expected_keys, segment_size, k);
@@ -248,12 +248,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment
   batched_topk_keys(
     d_keys_in,
     d_keys_out,
-    ::cuda::argument::immediate_sequence{
-      segment_size_it, ::cuda::argument::bounds<segment_size_t{1}, static_max_segment_size>()},
-    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::args::__immediate_sequence{
+      segment_size_it, ::cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
     direction,
-    ::cuda::argument::immediate{num_segments},
-    ::cuda::argument::immediate{num_items});
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_items});
 
   // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k
   segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction);
@@ -286,11 +286,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment
   batched_topk_keys(
     d_keys_in_it,
     d_keys_out_it,
-    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<cuda::std::int64_t{1}, max_segment_size>()},
-    ::cuda::argument::immediate{k, ::cuda::argument::bounds<cuda::std::int64_t{1}, k>()},
+    ::cuda::args::immediate{segment_size, ::cuda::args::bounds<cuda::std::int64_t{1}, max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<cuda::std::int64_t{1}, k>()},
     cub::detail::topk::select::min,
-    ::cuda::argument::immediate{num_segments},
-    ::cuda::argument::immediate{num_segments * segment_size});
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_segments * segment_size});
 
   const int num_minus_zero = static_cast<int>(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{}));
   REQUIRE(num_minus_zero >= 1);
diff --git a/cub/test/catch2_test_device_segmented_topk_pairs.cu b/cub/test/catch2_test_device_segmented_topk_pairs.cu
index eb2825f15d2..e7041e445c6 100644
--- a/cub/test/catch2_test_device_segmented_topk_pairs.cu
+++ b/cub/test/catch2_test_device_segmented_topk_pairs.cu
@@ -220,11 +220,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small fixed-size segments"
     d_keys_out,
     d_values_in,
     d_values_out,
-    ::cuda::argument::immediate{segment_size, ::cuda::argument::bounds<segment_size_t{1}, max_segment_size>()},
-    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::args::immediate{segment_size, ::cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
     direction,
-    ::cuda::argument::immediate{num_segments},
-    ::cuda::argument::immediate{num_segments * segment_size});
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_segments * segment_size});
 
   // Verification:
   // - We verify correct top-k selection through the keys
@@ -340,12 +340,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Pairs work with small variable-size segmen
     d_keys_out,
     d_values_in,
     d_values_out,
-    ::cuda::argument::immediate_sequence{
-      segment_size_it, ::cuda::argument::bounds<segment_size_t{1}, static_max_segment_size>()},
-    ::cuda::argument::immediate{k, ::cuda::argument::bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::args::__immediate_sequence{
+      segment_size_it, ::cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
     direction,
-    ::cuda::argument::immediate{num_segments},
-    ::cuda::argument::immediate{num_items});
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_items});
 
   // Verification:
   // - We verify correct top-k selection through the keys
diff --git a/libcudacxx/include/cuda/__argument/argument.h b/libcudacxx/include/cuda/__argument/argument.h
index 8c1933bdc7b..5e9d14a3bf5 100644
--- a/libcudacxx/include/cuda/__argument/argument.h
+++ b/libcudacxx/include/cuda/__argument/argument.h
@@ -44,6 +44,8 @@
 
 _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT
 
+struct __access;
+
 // =====================================================================
 // __element_type_of
 // =====================================================================
@@ -102,31 +104,28 @@ inline constexpr bool __is_iterable_v<_Tp,
 // spelling carries that intent.
 
 //! @brief Wraps a compile-time constant argument value.
-template <auto _Value>
-struct constant
+template <auto _Value, class _Tp = decltype(_Value)>
+class constant
 {
-  using value_type     = ::cuda::std::remove_cvref_t<decltype(_Value)>;
+public:
+  using value_type     = ::cuda::std::remove_cvref_t<_Tp>;
   using __element_type = value_type;
 
-  [[nodiscard]] _CCCL_API static constexpr value_type value() noexcept
+  [[nodiscard]] _CCCL_API static constexpr value_type __get_value() noexcept
   {
-    return _Value;
+    return static_cast<value_type>(_Value);
   }
 };
 
 //! @brief Wraps a compile-time constant argument sequence.
 template <auto _Value>
-struct constant_sequence
+class __constant_sequence
 {
+public:
   using value_type     = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using __element_type = __element_type_of_t<value_type>;
 
   static_assert(__is_sequence_v<value_type>, "constant sequence arguments must have a distinct element type");
-
-  [[nodiscard]] _CCCL_API static constexpr value_type value() noexcept
-  {
-    return _Value;
-  }
 };
 
 // =====================================================================
@@ -231,7 +230,7 @@ template <class _ElementType, class _StaticBounds>
 _CCCL_API constexpr void __validate_bounds_intersection(runtime_bounds<_ElementType> __runtime_bounds) noexcept
 {
   static_assert(__valid_static_bounds_v<_ElementType, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
   _CCCL_VERIFY((__has_bounds_intersection<_ElementType, _StaticBounds>(__runtime_bounds)),
                "static and runtime argument bounds do not intersect");
@@ -265,17 +264,20 @@ _CCCL_API constexpr void __validate_runtime_element_bounds(
 //!
 //! The value is host-accessible at API call time.
 template <class _Arg, class _StaticBounds = no_bounds>
-struct immediate
+class immediate
 {
+public:
   using __element_type = __element_type_of_t<_Arg>;
 
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
+private:
+  friend struct __access;
+
   _Arg __arg_;
 
-private:
   _CCCL_API constexpr void __validate_value() const noexcept
   {
     if constexpr (::cuda::std::is_same_v<::cuda::std::remove_cvref_t<_Arg>, __element_type>
@@ -307,24 +309,27 @@ _CCCL_HOST_DEVICE immediate(_Arg, static_bounds<_Lowest, _Highest>)
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
-// immediate_sequence
+// __immediate_sequence
 // =====================================================================
 
 //! @brief Wraps a runtime argument sequence with optional bounds.
 template <class _Arg, class _StaticBounds = no_bounds>
-struct immediate_sequence
+class __immediate_sequence
 {
+public:
   using __element_type = __element_type_of_t<_Arg>;
 
   static_assert(__is_sequence_v<_Arg>, "immediate sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
+private:
+  friend struct __access;
+
   _Arg __arg_;
   runtime_bounds<__element_type> __runtime_bounds_{};
 
-private:
   _CCCL_API constexpr void __validate_bounds() const noexcept
   {
     __validate_bounds_intersection<__element_type, _StaticBounds>(__runtime_bounds_);
@@ -348,14 +353,14 @@ struct immediate_sequence
   }
 
 public:
-  _CCCL_API constexpr immediate_sequence(_Arg __arg) noexcept
+  _CCCL_API constexpr __immediate_sequence(_Arg __arg) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_bounds();
     __validate_value();
   }
 
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds) noexcept
+  _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
     __validate_bounds();
@@ -363,7 +368,7 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr __immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -373,7 +378,7 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept
+  _CCCL_API constexpr __immediate_sequence(_Arg __arg, _StaticBounds, runtime_bounds<_BoundsTp> __rb) noexcept
       : __arg_{::cuda::std::move(__arg)}
       , __runtime_bounds_{__runtime_bound_cast<__element_type>(__rb.lower()),
                           __runtime_bound_cast<__element_type>(__rb.upper())}
@@ -383,23 +388,23 @@ struct immediate_sequence
   }
 
   template <class _BoundsTp>
-  _CCCL_API constexpr immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
-      : immediate_sequence(::cuda::std::move(__arg), __sb, __rb)
+  _CCCL_API constexpr __immediate_sequence(_Arg __arg, runtime_bounds<_BoundsTp> __rb, _StaticBounds __sb) noexcept
+      : __immediate_sequence(::cuda::std::move(__arg), __sb, __rb)
   {}
 };
 
 #ifndef _CCCL_DOXYGEN_INVOKED
 template <class _Arg, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>)
-  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE __immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>)
+  -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, auto _Lowest, auto _Highest, class _Tp>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>)
-  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE __immediate_sequence(_Arg, static_bounds<_Lowest, _Highest>, runtime_bounds<_Tp>)
+  -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 
 template <class _Arg, class _Tp, auto _Lowest, auto _Highest>
-_CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>)
-  -> immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
+_CCCL_HOST_DEVICE __immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Highest>)
+  -> __immediate_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
 // =====================================================================
@@ -408,17 +413,22 @@ _CCCL_HOST_DEVICE immediate_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_L
 
 //! @brief Common base for deferred argument wrappers.
 template <class _Arg, class _StaticBounds = no_bounds>
-struct __deferred_base
+class __deferred_base
 {
+public:
   using __element_type = __element_type_of_t<_Arg>;
 
   static_assert(__valid_static_bounds_v<__element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
+private:
+  friend struct __access;
+
   _Arg __arg_;
   runtime_bounds<__element_type> __runtime_bounds_{};
 
+public:
   _CCCL_API constexpr __deferred_base(_Arg __arg) noexcept
       : __arg_{::cuda::std::move(__arg)}
   {
@@ -458,8 +468,9 @@ struct __deferred_base
 //! @brief Wraps a reference to a single value that is potentially not available at API call time but will be available
 //! by the time the argument is consumed in stream order.
 template <class _Arg, class _StaticBounds = no_bounds>
-struct deferred : __deferred_base<_Arg, _StaticBounds>
+class deferred : public __deferred_base<_Arg, _StaticBounds>
 {
+public:
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
 };
 
@@ -485,8 +496,9 @@ _CCCL_HOST_DEVICE deferred(_Arg, runtime_bounds<_Tp>, static_bounds<_Lowest, _Hi
 //! @brief Wraps a reference to a sequence of values that is potentially not available at API call time but will be
 //! available by the time the argument is consumed in stream order.
 template <class _Arg, class _StaticBounds = no_bounds>
-struct deferred_sequence : __deferred_base<_Arg, _StaticBounds>
+class deferred_sequence : public __deferred_base<_Arg, _StaticBounds>
 {
+public:
   static_assert(__is_sequence_v<_Arg>, "deferred sequence arguments must have a distinct element type");
 
   using __deferred_base<_Arg, _StaticBounds>::__deferred_base;
@@ -512,6 +524,97 @@ _CCCL_HOST_DEVICE deferred_sequence(_Arg, runtime_bounds<_Tp>, static_bounds<_Lo
   -> deferred_sequence<_Arg, static_bounds<_Lowest, _Highest>>;
 #endif // _CCCL_DOXYGEN_INVOKED
 
+// =====================================================================
+// __access
+// =====================================================================
+
+struct __access
+{
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(immediate<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr const _Arg& __arg(const immediate<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(immediate<_Arg, _StaticBounds>&& __wrapper) noexcept
+  {
+    return ::cuda::std::move(__wrapper.__arg_);
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(__immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr const _Arg&
+  __arg(const __immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(__immediate_sequence<_Arg, _StaticBounds>&& __wrapper) noexcept
+  {
+    return ::cuda::std::move(__wrapper.__arg_);
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg& __arg(__deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr const _Arg&
+  __arg(const __deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__arg_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr _Arg&& __arg(__deferred_base<_Arg, _StaticBounds>&& __wrapper) noexcept
+  {
+    return ::cuda::std::move(__wrapper.__arg_);
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr runtime_bounds<__element_type_of_t<_Arg>>&
+  __runtime_bounds(__immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__runtime_bounds_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr const runtime_bounds<__element_type_of_t<_Arg>>&
+  __runtime_bounds(const __immediate_sequence<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__runtime_bounds_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr runtime_bounds<__element_type_of_t<_Arg>>&
+  __runtime_bounds(__deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__runtime_bounds_;
+  }
+
+  template <class _Arg, class _StaticBounds>
+  [[nodiscard]] _CCCL_API static constexpr const runtime_bounds<__element_type_of_t<_Arg>>&
+  __runtime_bounds(const __deferred_base<_Arg, _StaticBounds>& __wrapper) noexcept
+  {
+    return __wrapper.__runtime_bounds_;
+  }
+};
+
 // =====================================================================
 // __unwrap
 // =====================================================================
@@ -520,12 +623,12 @@ template <class _Tp>
 inline constexpr bool __is_wrapper_v = false;
 template <class _Arg, class _StaticBounds>
 inline constexpr bool __is_wrapper_v<immediate<_Arg, _StaticBounds>> = true;
+template <auto _Value, class _Tp>
+inline constexpr bool __is_wrapper_v<constant<_Value, _Tp>> = true;
 template <auto _Value>
-inline constexpr bool __is_wrapper_v<constant<_Value>> = true;
-template <auto _Value>
-inline constexpr bool __is_wrapper_v<constant_sequence<_Value>> = true;
+inline constexpr bool __is_wrapper_v<__constant_sequence<_Value>> = true;
 template <class _Arg, class _StaticBounds>
-inline constexpr bool __is_wrapper_v<immediate_sequence<_Arg, _StaticBounds>> = true;
+inline constexpr bool __is_wrapper_v<__immediate_sequence<_Arg, _StaticBounds>> = true;
 template <class _Arg, class _StaticBounds>
 inline constexpr bool __is_wrapper_v<deferred<_Arg, _StaticBounds>> = true;
 template <class _Arg, class _StaticBounds>
@@ -541,99 +644,99 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate<_Arg, _StaticBounds>&& __arg) noexcept
 {
-  return ::cuda::std::move(__arg.__arg_);
+  return __access::__arg(::cuda::std::move(__arg));
 }
 
-template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t<decltype(_Value)>
-__unwrap(const constant<_Value>&) noexcept
+template <auto _Value, class _Tp>
+[[nodiscard]] _CCCL_API constexpr typename constant<_Value, _Tp>::value_type
+__unwrap(const constant<_Value, _Tp>&) noexcept
 {
-  return _Value;
+  return constant<_Value, _Tp>::__get_value();
 }
 
 template <auto _Value>
 [[nodiscard]] _CCCL_API constexpr ::cuda::std::remove_cvref_t<decltype(_Value)>
-__unwrap(const constant_sequence<_Value>&) noexcept
+__unwrap(const __constant_sequence<_Value>&) noexcept
 {
   return _Value;
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(__immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const __immediate_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr _Arg __unwrap(__immediate_sequence<_Arg, _StaticBounds>&& __arg) noexcept
 {
-  return ::cuda::std::move(__arg.__arg_);
+  return __access::__arg(::cuda::std::move(__arg));
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred<_Arg, _StaticBounds>&& __arg) noexcept
 {
-  return ::cuda::std::move(__arg.__arg_);
+  return __access::__arg(::cuda::std::move(__arg));
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg& __unwrap(deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr const _Arg& __unwrap(const deferred_sequence<_Arg, _StaticBounds>& __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr _Arg __unwrap(deferred_sequence<_Arg, _StaticBounds>&& __arg) noexcept
 {
-  return ::cuda::std::move(__arg.__arg_);
+  return __access::__arg(::cuda::std::move(__arg));
 }
 
-template <auto _Value>
+template <auto _Value, class _Tp>
 _CCCL_API constexpr auto __constant_compute_lowest() noexcept
 {
-  return _Value;
+  return constant<_Value, _Tp>::__get_value();
 }
 
-template <auto _Value>
+template <auto _Value, class _Tp>
 _CCCL_API constexpr auto __constant_compute_highest() noexcept
 {
-  return _Value;
+  return constant<_Value, _Tp>::__get_value();
 }
 
 template <auto _Value>
@@ -684,16 +787,16 @@ struct __traits_impl
   static constexpr element_type highest = (::cuda::std::numeric_limits<element_type>::max)();
 };
 
-template <auto _Value>
-struct __traits_impl<constant<_Value>>
+template <auto _Value, class _Tp>
+struct __traits_impl<constant<_Value, _Tp>>
 {
-  using value_type                      = ::cuda::std::remove_cvref_t<decltype(_Value)>;
+  using value_type                      = typename constant<_Value, _Tp>::value_type;
   using element_type                    = value_type;
   static constexpr bool is_constant     = true;
   static constexpr bool is_deferred     = false;
   static constexpr bool is_single_value = true;
-  static constexpr element_type lowest  = __constant_compute_lowest<_Value>();
-  static constexpr element_type highest = __constant_compute_highest<_Value>();
+  static constexpr element_type lowest  = __constant_compute_lowest<_Value, _Tp>();
+  static constexpr element_type highest = __constant_compute_highest<_Value, _Tp>();
 };
 
 template <class _Arg, class _StaticBounds>
@@ -702,7 +805,7 @@ struct __traits_impl<immediate<_Arg, _StaticBounds>>
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
   static constexpr bool is_constant     = false;
@@ -713,7 +816,7 @@ struct __traits_impl<immediate<_Arg, _StaticBounds>>
 };
 
 template <auto _Value>
-struct __traits_impl<constant_sequence<_Value>>
+struct __traits_impl<__constant_sequence<_Value>>
 {
   using value_type   = ::cuda::std::remove_cvref_t<decltype(_Value)>;
   using element_type = __element_type_of_t<value_type>;
@@ -726,13 +829,13 @@ struct __traits_impl<constant_sequence<_Value>>
 };
 
 template <class _Arg, class _StaticBounds>
-struct __traits_impl<immediate_sequence<_Arg, _StaticBounds>>
+struct __traits_impl<__immediate_sequence<_Arg, _StaticBounds>>
 {
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__is_sequence_v<value_type>, "immediate sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
   static constexpr bool is_constant     = false;
@@ -748,7 +851,7 @@ struct __traits_impl<deferred<_Arg, _StaticBounds>>
   using value_type   = _Arg;
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
   static constexpr bool is_constant     = false;
@@ -765,7 +868,7 @@ struct __traits_impl<deferred_sequence<_Arg, _StaticBounds>>
   using element_type = __element_type_of_t<_Arg>;
   static_assert(__is_sequence_v<value_type>, "deferred sequence arguments must have a distinct element type");
   static_assert(__valid_static_bounds_v<element_type, _StaticBounds>,
-                "argument wrapper bounds type must be cuda::argument::no_bounds or cuda::argument::static_bounds with "
+                "argument wrapper bounds type must be cuda::args::no_bounds or cuda::args::static_bounds with "
                 "values representable by the element type");
 
   static constexpr bool is_constant     = false;
@@ -791,14 +894,14 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) )
   return ::cuda::std::numeric_limits<__element_type_of_t<_Tp>>::lowest();
 }
 
-template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value>) noexcept
+template <auto _Value, class _Tp>
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant<_Value, _Tp>) noexcept
 {
-  return __constant_compute_lowest<_Value>();
+  return __constant_compute_lowest<_Value, _Tp>();
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(constant_sequence<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(__constant_sequence<_Value>) noexcept
 {
   return __constant_sequence_compute_lowest<_Value>();
 }
@@ -806,31 +909,34 @@ template <auto _Value>
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate<_Arg, _StaticBounds> __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __lowest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __lowest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __lowest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_lowest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_lowest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 //! @brief Returns the effective highest bound, combining static and runtime bounds.
@@ -841,14 +947,14 @@ _CCCL_REQUIRES((!__is_wrapper_v<::cuda::std::remove_cv_t<_Tp>>) )
   return (::cuda::std::numeric_limits<__element_type_of_t<_Tp>>::max)();
 }
 
-template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value>) noexcept
+template <auto _Value, class _Tp>
+[[nodiscard]] _CCCL_API constexpr auto __highest_(constant<_Value, _Tp>) noexcept
 {
-  return __constant_compute_highest<_Value>();
+  return __constant_compute_highest<_Value, _Tp>();
 }
 
 template <auto _Value>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(constant_sequence<_Value>) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(__constant_sequence<_Value>) noexcept
 {
   return __constant_sequence_compute_highest<_Value>();
 }
@@ -856,31 +962,34 @@ template <auto _Value>
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __highest_(immediate<_Arg, _StaticBounds> __arg) noexcept
 {
-  return __arg.__arg_;
+  return __access::__arg(__arg);
 }
 
 template <class _Arg, class _StaticBounds>
-[[nodiscard]] _CCCL_API constexpr auto __highest_(immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
+[[nodiscard]] _CCCL_API constexpr auto __highest_(__immediate_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_highest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __highest_(deferred<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_highest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 template <class _Arg, class _StaticBounds>
 [[nodiscard]] _CCCL_API constexpr auto __highest_(deferred_sequence<_Arg, _StaticBounds> __arg) noexcept
 {
-  using _ET = __element_type_of_t<_Arg>;
-  __validate_bounds_intersection<_ET, _StaticBounds>(__arg.__runtime_bounds_);
-  return __effective_highest<_ET, _StaticBounds>(__arg.__runtime_bounds_);
+  using _ET                    = __element_type_of_t<_Arg>;
+  const auto& __runtime_bounds = __access::__runtime_bounds(__arg);
+  __validate_bounds_intersection<_ET, _StaticBounds>(__runtime_bounds);
+  return __effective_highest<_ET, _StaticBounds>(__runtime_bounds);
 }
 
 _CCCL_END_NAMESPACE_CUDA_ARGUMENT
diff --git a/libcudacxx/include/cuda/__argument/argument_bounds.h b/libcudacxx/include/cuda/__argument/argument_bounds.h
index 630ae897910..4f5429936dc 100644
--- a/libcudacxx/include/cuda/__argument/argument_bounds.h
+++ b/libcudacxx/include/cuda/__argument/argument_bounds.h
@@ -45,8 +45,9 @@ struct no_bounds
 //! @tparam _Lower The static lower bound.
 //! @tparam _Upper The static upper bound.
 template <auto _Lower, auto _Upper>
-struct static_bounds
+class static_bounds
 {
+public:
   static_assert(::cuda::std::is_same_v<decltype(_Lower), decltype(_Upper)>,
                 "Static bounds endpoints must have the same type");
   static_assert(_Lower <= _Upper, "Lower bound must be <= upper bound");
@@ -74,11 +75,12 @@ inline constexpr bool __is_static_bounds_v<static_bounds<_Lower, _Upper>> = true
 //!
 //! @tparam _Tp The value type of the bounds.
 template <class _Tp>
-struct runtime_bounds
+class runtime_bounds
 {
   _Tp __lower_ = ::cuda::std::numeric_limits<_Tp>::lowest();
   _Tp __upper_ = (::cuda::std::numeric_limits<_Tp>::max)();
 
+public:
   constexpr runtime_bounds() noexcept = default;
 
   _CCCL_API constexpr runtime_bounds(_Tp __lower, _Tp __upper) noexcept
diff --git a/libcudacxx/include/cuda/std/__internal/namespaces.h b/libcudacxx/include/cuda/std/__internal/namespaces.h
index f3cc191adfc..4876dd6f449 100644
--- a/libcudacxx/include/cuda/std/__internal/namespaces.h
+++ b/libcudacxx/include/cuda/std/__internal/namespaces.h
@@ -115,8 +115,8 @@
 #define _CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION _CCCL_BEGIN_NAMESPACE(cuda::execution)
 #define _CCCL_END_NAMESPACE_CUDA_EXECUTION   _CCCL_END_NAMESPACE(cuda::execution)
 
-#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::argument)
-#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT   _CCCL_END_NAMESPACE(cuda::argument)
+#define _CCCL_BEGIN_NAMESPACE_CUDA_ARGUMENT _CCCL_BEGIN_NAMESPACE(cuda::args)
+#define _CCCL_END_NAMESPACE_CUDA_ARGUMENT   _CCCL_END_NAMESPACE(cuda::args)
 
 // Namespace to avoid name collisions with CPOs on clang-16 (see
 // https://godbolt.org/z/9TadonrdM for example). MSVC's ancient parser also gets confused with
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
index 6be3c0d925b..46070dbabbd 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_bounds.pass.cpp
@@ -16,31 +16,31 @@
 
 TEST_FUNC constexpr bool test()
 {
-  // --- static_argument_bounds ---
+  // --- static_bounds ---
 
   // Basic static bounds
   {
-    constexpr auto b = cuda::argument::static_bounds<1, 4096>{};
+    constexpr auto b = cuda::args::static_bounds<1, 4096>{};
     static_assert(b.lower() == 1);
     static_assert(b.upper() == 4096);
   }
 
   // Exact static bounds
   {
-    constexpr auto b = cuda::argument::static_bounds<42, 42>{};
+    constexpr auto b = cuda::args::static_bounds<42, 42>{};
     static_assert(b.lower() == 42);
     static_assert(b.upper() == 42);
   }
 
   // Long type deduced from NTTPs
   {
-    static_assert(cuda::std::is_same_v<decltype(cuda::argument::static_bounds<0L, 1000L>::lower()), long>);
+    static_assert(cuda::std::is_same_v<decltype(cuda::args::static_bounds<0L, 1000L>::lower()), long>);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Static bounds preserve their original NTTP types
   {
-    constexpr auto b = cuda::argument::bounds<1.0f, 8.0f>();
+    constexpr auto b = cuda::args::bounds<1.0f, 8.0f>();
     static_assert(b.lower() == 1.0f);
     static_assert(b.upper() == 8);
     static_assert(cuda::std::is_same_v<decltype(b.lower()), float>);
@@ -48,17 +48,13 @@ TEST_FUNC constexpr bool test()
   }
 #endif // TEST_HAS_CLASS_NTTP
 
-  // --- runtime_argument_bounds ---
+  // --- runtime_bounds ---
 
   // Basic runtime bounds
   {
-    auto b = cuda::argument::runtime_bounds{10, 100};
+    auto b = cuda::args::runtime_bounds{10, 100};
     assert(b.lower() == 10);
     assert(b.upper() == 100);
-    assert(b.__lower_ == 10);
-    assert(b.__upper_ == 100);
-    b.__upper_ = 90;
-    assert(b.upper() == 90);
     static_assert(cuda::std::is_same_v<decltype(b.lower()), int>);
   }
 
@@ -66,43 +62,43 @@ TEST_FUNC constexpr bool test()
 
   // Static via factory
   {
-    constexpr auto b = cuda::argument::bounds<1, 8>();
+    constexpr auto b = cuda::args::bounds<1, 8>();
     static_assert(b.lower() == 1);
     static_assert(b.upper() == 8);
-    static_assert(cuda::argument::__is_static_bounds_cv_v<decltype(b)>);
-    static_assert(!cuda::argument::__is_runtime_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::argument::__is_bounds_v<decltype(b)>);
+    static_assert(cuda::args::__is_static_bounds_cv_v<decltype(b)>);
+    static_assert(!cuda::args::__is_runtime_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::args::__is_bounds_v<decltype(b)>);
   }
 
   // Runtime via factory
   {
-    auto b = cuda::argument::bounds(10, 100);
+    auto b = cuda::args::bounds(10, 100);
     assert(b.lower() == 10);
     assert(b.upper() == 100);
-    static_assert(!cuda::argument::__is_static_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::argument::__is_runtime_bounds_cv_v<decltype(b)>);
-    static_assert(cuda::argument::__is_bounds_v<decltype(b)>);
+    static_assert(!cuda::args::__is_static_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::args::__is_runtime_bounds_cv_v<decltype(b)>);
+    static_assert(cuda::args::__is_bounds_v<decltype(b)>);
   }
 
   // Static and runtime bounds intersection
   {
-    static_assert(cuda::argument::__has_bounds_intersection<int, cuda::argument::static_bounds<1, 100>>(
-      cuda::argument::runtime_bounds<int>{50, 200}));
-    static_assert(!cuda::argument::__has_bounds_intersection<int, cuda::argument::static_bounds<100, 200>>(
-      cuda::argument::runtime_bounds<int>{0, 50}));
+    static_assert(cuda::args::__has_bounds_intersection<int, cuda::args::static_bounds<1, 100>>(
+      cuda::args::runtime_bounds<int>{50, 200}));
+    static_assert(!cuda::args::__has_bounds_intersection<int, cuda::args::static_bounds<100, 200>>(
+      cuda::args::runtime_bounds<int>{0, 50}));
   }
 
   // Non-bounds type
   {
-    static_assert(!cuda::argument::__is_bounds_v<int>);
+    static_assert(!cuda::args::__is_bounds_v<int>);
   }
 
   // Bounds types accepted by argument wrapper template parameters
   {
-    static_assert(cuda::argument::__valid_static_bounds_v<int, cuda::argument::no_bounds>);
-    static_assert(cuda::argument::__valid_static_bounds_v<int, cuda::argument::static_bounds<1, 8>>);
-    static_assert(!cuda::argument::__valid_static_bounds_v<int, cuda::argument::runtime_bounds<int>>);
-    static_assert(!cuda::argument::__valid_static_bounds_v<int, int>);
+    static_assert(cuda::args::__valid_static_bounds_v<int, cuda::args::no_bounds>);
+    static_assert(cuda::args::__valid_static_bounds_v<int, cuda::args::static_bounds<1, 8>>);
+    static_assert(!cuda::args::__valid_static_bounds_v<int, cuda::args::runtime_bounds<int>>);
+    static_assert(!cuda::args::__valid_static_bounds_v<int, int>);
   }
 
   return true;
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
index 632fe50945a..bf3aec4ed7b 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/argument_traits.pass.cpp
@@ -50,128 +50,123 @@ TEST_FUNC void test()
 {
   // --- __is_sequence_v / __is_single_value_v ---
 
-  static_assert(!cuda::argument::__is_sequence_v<int>);
-  static_assert(!cuda::argument::__is_sequence_v<color>);
-  static_assert(!cuda::argument::__is_sequence_v<non_sequence_value>);
-  static_assert(cuda::argument::__is_sequence_v<int*>);
-  static_assert(cuda::argument::__is_sequence_v<cuda::counting_iterator<int>>);
-  static_assert(cuda::argument::__is_sequence_v<cuda::std::span<int, 1>>);
-  static_assert(cuda::argument::__is_sequence_v<const cuda::std::span<int, 1>&>);
-  static_assert(cuda::argument::__is_sequence_v<cuda::std::span<int>>);
-  static_assert(cuda::argument::__is_sequence_v<cuda::std::array<int, 3>>);
-  static_assert(cuda::argument::__is_sequence_v<range_like<int>>);
-  static_assert(cuda::argument::__is_sequence_v<element_type_like<int>>);
-  static_assert(cuda::argument::__is_sequence_v<value_type_like<int>>);
-
-  static_assert(cuda::argument::__is_single_value_v<int>);
-  static_assert(cuda::argument::__is_single_value_v<float>);
-  static_assert(cuda::argument::__is_single_value_v<double>);
-  static_assert(cuda::argument::__is_single_value_v<const int>);
-  static_assert(cuda::argument::__is_single_value_v<color>);
-  static_assert(cuda::argument::__is_single_value_v<non_sequence_value>);
-  static_assert(!cuda::argument::__is_single_value_v<int*>);
-  static_assert(!cuda::argument::__is_single_value_v<cuda::counting_iterator<int>>);
-  static_assert(!cuda::argument::__is_single_value_v<cuda::std::span<int, 1>>);
-  static_assert(!cuda::argument::__is_single_value_v<const cuda::std::span<int, 1>&>);
-  static_assert(!cuda::argument::__is_single_value_v<cuda::std::span<int>>);
-  static_assert(!cuda::argument::__is_single_value_v<cuda::std::span<int, 4>>);
-  static_assert(!cuda::argument::__is_single_value_v<cuda::std::array<int, 3>>);
+  static_assert(!cuda::args::__is_sequence_v<int>);
+  static_assert(!cuda::args::__is_sequence_v<color>);
+  static_assert(!cuda::args::__is_sequence_v<non_sequence_value>);
+  static_assert(cuda::args::__is_sequence_v<int*>);
+  static_assert(cuda::args::__is_sequence_v<cuda::counting_iterator<int>>);
+  static_assert(cuda::args::__is_sequence_v<cuda::std::span<int, 1>>);
+  static_assert(cuda::args::__is_sequence_v<const cuda::std::span<int, 1>&>);
+  static_assert(cuda::args::__is_sequence_v<cuda::std::span<int>>);
+  static_assert(cuda::args::__is_sequence_v<cuda::std::array<int, 3>>);
+  static_assert(cuda::args::__is_sequence_v<range_like<int>>);
+  static_assert(cuda::args::__is_sequence_v<element_type_like<int>>);
+  static_assert(cuda::args::__is_sequence_v<value_type_like<int>>);
+
+  static_assert(cuda::args::__is_single_value_v<int>);
+  static_assert(cuda::args::__is_single_value_v<float>);
+  static_assert(cuda::args::__is_single_value_v<double>);
+  static_assert(cuda::args::__is_single_value_v<const int>);
+  static_assert(cuda::args::__is_single_value_v<color>);
+  static_assert(cuda::args::__is_single_value_v<non_sequence_value>);
+  static_assert(!cuda::args::__is_single_value_v<int*>);
+  static_assert(!cuda::args::__is_single_value_v<cuda::counting_iterator<int>>);
+  static_assert(!cuda::args::__is_single_value_v<cuda::std::span<int, 1>>);
+  static_assert(!cuda::args::__is_single_value_v<const cuda::std::span<int, 1>&>);
+  static_assert(!cuda::args::__is_single_value_v<cuda::std::span<int>>);
+  static_assert(!cuda::args::__is_single_value_v<cuda::std::span<int, 4>>);
+  static_assert(!cuda::args::__is_single_value_v<cuda::std::array<int, 3>>);
 
   // --- __element_type_of_t ---
 
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<const cuda::std::span<int, 1>&>, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<int*>, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::counting_iterator<int>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::std::array<int, 3>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<range_like<int>>, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<element_type_like<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<const cuda::std::span<int, 1>&>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<int*>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<cuda::counting_iterator<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<cuda::std::array<int, 3>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<range_like<int>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<element_type_like<int>>, int>);
   static_assert(
-    cuda::std::is_same_v<cuda::argument::__element_type_of_t<cuda::std::mdspan<const int, cuda::std::extents<int, 1>>>,
-                         int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__element_type_of_t<value_type_like<int>>, int>);
+    cuda::std::is_same_v<cuda::args::__element_type_of_t<cuda::std::mdspan<const int, cuda::std::extents<int, 1>>>, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__element_type_of_t<value_type_like<int>>, int>);
 
   // --- argument_traits: is_deferred ---
 
-  static_assert(!cuda::argument::__traits<int>::is_deferred);
-  static_assert(!cuda::argument::__traits<cuda::argument::immediate<int>>::is_deferred);
-  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_deferred);
-  static_assert(!cuda::argument::__traits<cuda::argument::constant<42>>::is_deferred);
+  static_assert(!cuda::args::__traits<int>::is_deferred);
+  static_assert(!cuda::args::__traits<cuda::args::immediate<int>>::is_deferred);
+  static_assert(!cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>::is_deferred);
+  static_assert(!cuda::args::__traits<cuda::args::constant<42>>::is_deferred);
 #if TEST_HAS_CLASS_NTTP
-  static_assert(
-    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_deferred);
+  static_assert(!cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_deferred);
 #endif // TEST_HAS_CLASS_NTTP
-  static_assert(cuda::argument::__traits<cuda::argument::deferred<cuda::std::span<int, 1>>>::is_deferred);
-  static_assert(cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>::is_deferred);
+  static_assert(cuda::args::__traits<cuda::args::deferred<cuda::std::span<int, 1>>>::is_deferred);
+  static_assert(cuda::args::__traits<cuda::args::deferred_sequence<cuda::std::span<int>>>::is_deferred);
 
   // --- argument_traits: is_single_value ---
 
-  static_assert(cuda::argument::__traits<int>::is_single_value);
-  static_assert(!cuda::argument::__traits<int*>::is_single_value);
-  static_assert(cuda::argument::__traits<cuda::argument::immediate<int>>::is_single_value);
-  static_assert(cuda::argument::__traits<cuda::argument::immediate<int*>>::is_single_value);
-  static_assert(cuda::argument::__traits<cuda::argument::immediate<cuda::counting_iterator<int>>>::is_single_value);
-  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
-  static_assert(cuda::argument::__traits<cuda::argument::constant<42>>::is_single_value);
+  static_assert(cuda::args::__traits<int>::is_single_value);
+  static_assert(!cuda::args::__traits<int*>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::immediate<int>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::immediate<int*>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::immediate<cuda::counting_iterator<int>>>::is_single_value);
+  static_assert(!cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::constant<42>>::is_single_value);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
-    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+    !cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
 #endif // TEST_HAS_CLASS_NTTP
-  static_assert(cuda::argument::__traits<cuda::argument::deferred<int*>>::is_single_value);
-  static_assert(!cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::deferred<int*>>::is_single_value);
+  static_assert(!cuda::args::__traits<cuda::args::deferred_sequence<cuda::std::span<int>>>::is_single_value);
 
   // --- argument_traits: value_type ---
 
-  static_assert(cuda::std::is_same_v<cuda::argument::__traits<int>::value_type, int>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::immediate<int>>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__traits<int>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__traits<cuda::args::immediate<int>>::value_type, int>);
   static_assert(
-    cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::value_type,
+    cuda::std::is_same_v<cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>::value_type,
                          cuda::std::span<int>>);
-  static_assert(cuda::std::is_same_v<cuda::argument::__traits<cuda::argument::constant<42>>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__traits<cuda::args::constant<42>>::value_type, int>);
+  static_assert(cuda::std::is_same_v<cuda::args::__traits<cuda::args::constant<10, float>>::value_type, float>);
 #if TEST_HAS_CLASS_NTTP
-  static_assert(
-    cuda::std::is_same_v<
-      cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::value_type,
-      cuda::std::array<int, 3>>);
+  static_assert(cuda::std::is_same_v<
+                cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::value_type,
+                cuda::std::array<int, 3>>);
 #endif // TEST_HAS_CLASS_NTTP
 
   // --- argument_traits: lowest / highest ---
 
-  static_assert(cuda::argument::__traits<int>::lowest == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::argument::__traits<int>::highest == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::argument::__traits<const int>::lowest == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::argument::__traits<int&>::highest == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::argument::__traits<float>::lowest == cuda::std::numeric_limits<float>::lowest());
-  static_assert(cuda::argument::__traits<float>::highest == (cuda::std::numeric_limits<float>::max)());
-  static_assert(
-    cuda::argument::__traits<const cuda::argument::immediate<int, cuda::argument::static_bounds<1, 8>>>::lowest == 1);
+  static_assert(cuda::args::__traits<int>::lowest == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::args::__traits<int>::highest == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::args::__traits<const int>::lowest == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::args::__traits<int&>::highest == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::args::__traits<float>::lowest == cuda::std::numeric_limits<float>::lowest());
+  static_assert(cuda::args::__traits<float>::highest == (cuda::std::numeric_limits<float>::max)());
+  static_assert(cuda::args::__traits<const cuda::args::immediate<int, cuda::args::static_bounds<1, 8>>>::lowest == 1);
+  static_assert(cuda::args::__traits<cuda::args::immediate<int, cuda::args::static_bounds<1, 8>>&>::highest == 8);
   static_assert(
-    cuda::argument::__traits<cuda::argument::immediate<int, cuda::argument::static_bounds<1, 8>>&>::highest == 8);
-  static_assert(
-    cuda::argument::__traits<
-      cuda::argument::immediate_sequence<cuda::std::span<int>, cuda::argument::static_bounds<1, 8>>>::highest
+    cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>, cuda::args::static_bounds<1, 8>>>::highest
     == 8);
+  static_assert(cuda::args::__traits<cuda::args::constant<10, float>>::lowest == 10.0f);
+  static_assert(cuda::args::__traits<cuda::args::constant<10, float>>::highest == 10.0f);
 #if TEST_HAS_CLASS_NTTP
-  static_assert(
-    cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::lowest == 1);
-  static_assert(
-    cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::highest == 3);
+  static_assert(cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::lowest == 1);
+  static_assert(cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{3, 1, 2}>>::highest == 3);
 #endif // TEST_HAS_CLASS_NTTP
 
   // --- Free function bounds on plain values ---
 
-  static_assert(cuda::argument::__lowest_(42) == cuda::std::numeric_limits<int>::lowest());
-  static_assert(cuda::argument::__highest_(42) == (cuda::std::numeric_limits<int>::max)());
-  static_assert(cuda::argument::__lowest_(1.0f) == cuda::std::numeric_limits<float>::lowest());
-  static_assert(cuda::argument::__highest_(1.0f) == (cuda::std::numeric_limits<float>::max)());
+  static_assert(cuda::args::__lowest_(42) == cuda::std::numeric_limits<int>::lowest());
+  static_assert(cuda::args::__highest_(42) == (cuda::std::numeric_limits<int>::max)());
+  static_assert(cuda::args::__lowest_(1.0f) == cuda::std::numeric_limits<float>::lowest());
+  static_assert(cuda::args::__highest_(1.0f) == (cuda::std::numeric_limits<float>::max)());
 
   // --- Scalar and sequence wrappers expose distinct single-value traits ---
 
-  static_assert(cuda::argument::__traits<cuda::argument::constant<42>>::is_single_value);
-  static_assert(cuda::argument::__traits<cuda::argument::immediate<int>>::is_single_value);
-  static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::constant<42>>::is_single_value);
+  static_assert(cuda::args::__traits<cuda::args::immediate<int>>::is_single_value);
+  static_assert(!cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
 #if TEST_HAS_CLASS_NTTP
   static_assert(
-    !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+    !cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
 #endif // TEST_HAS_CLASS_NTTP
 }
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
index 5dc2fb849c7..21b200e8f0f 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_argument.pass.cpp
@@ -22,110 +22,110 @@ TEST_FUNC constexpr bool test()
   // Deferred single value via span<T, 1>
   {
     int val  = 42;
-    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}};
-    assert(cuda::argument::__unwrap(def)[0] == 42);
-    assert(def.__arg_[0] == 42);
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == (cuda::std::numeric_limits<int>::max)());
+    auto def = cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}};
+    assert(cuda::args::__unwrap(def)[0] == 42);
+    assert(cuda::args::__access::__arg(def)[0] == 42);
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::args::__traits<decltype(def)>::highest == (cuda::std::numeric_limits<int>::max)());
   }
 
   // Deferred single value with static bounds
   {
     int val  = 42;
-    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds<1, 1000>()};
-    assert(cuda::argument::__unwrap(def)[0] == 42);
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == 1000);
+    auto def = cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}, cuda::args::bounds<1, 1000>()};
+    assert(cuda::args::__unwrap(def)[0] == 42);
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(def)>::highest == 1000);
   }
 
   // Deferred single value via pointer
   {
     int val     = 42;
-    using def_t = cuda::argument::deferred<int*, cuda::argument::static_bounds<0, 100>>;
-    static_assert(cuda::argument::__traits<def_t>::lowest == 0);
-    static_assert(cuda::argument::__traits<def_t>::highest == 100);
+    using def_t = cuda::args::deferred<int*, cuda::args::static_bounds<0, 100>>;
+    static_assert(cuda::args::__traits<def_t>::lowest == 0);
+    static_assert(cuda::args::__traits<def_t>::highest == 100);
     // Also verify construction works
-    auto def = cuda::argument::deferred{&val, cuda::argument::bounds<0, 100>()};
-    assert(cuda::argument::__unwrap(def) == &val);
+    auto def = cuda::args::deferred{&val, cuda::args::bounds<0, 100>()};
+    assert(cuda::args::__unwrap(def) == &val);
   }
 
   // Deferred single value via fancy iterator
   {
     auto it  = cuda::counting_iterator<int>{42};
-    auto def = cuda::argument::deferred{it, cuda::argument::bounds<0, 100>()};
-    assert(cuda::argument::__unwrap(def)[0] == 42);
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 0);
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == 100);
-    static_assert(cuda::argument::__traits<decltype(def)>::is_single_value);
+    auto def = cuda::args::deferred{it, cuda::args::bounds<0, 100>()};
+    assert(cuda::args::__unwrap(def)[0] == 42);
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 0);
+    static_assert(cuda::args::__traits<decltype(def)>::highest == 100);
+    static_assert(cuda::args::__traits<decltype(def)>::is_single_value);
   }
 
   // Deferred single value with both bounds, runtime bounds first
   {
-    int val  = 42;
-    auto def = cuda::argument::deferred{
-      cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 256>()};
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == 256);
-    assert(def.__runtime_bounds_.__lower_ == 5);
-    assert(def.__runtime_bounds_.__upper_ == 100);
-    assert(cuda::argument::__lowest_(def) == 5);
-    assert(cuda::argument::__highest_(def) == 100);
-    def.__runtime_bounds_.__upper_ = 90;
-    assert(cuda::argument::__highest_(def) == 90);
+    int val = 42;
+    auto def =
+      cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}, cuda::args::bounds(5, 100), cuda::args::bounds<1, 256>()};
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(def)>::highest == 256);
+    assert(cuda::args::__access::__runtime_bounds(def).lower() == 5);
+    assert(cuda::args::__access::__runtime_bounds(def).upper() == 100);
+    assert(cuda::args::__lowest_(def) == 5);
+    assert(cuda::args::__highest_(def) == 100);
+    cuda::args::__access::__runtime_bounds(def) = cuda::args::bounds(5, 90);
+    assert(cuda::args::__highest_(def) == 90);
   }
 
   // Deferred sequence via fancy iterator
   {
     auto it  = cuda::counting_iterator<int>{10};
-    auto def = cuda::argument::deferred_sequence{it, cuda::argument::bounds<0, 100>()};
-    assert(cuda::argument::__unwrap(def)[0] == 10);
-    assert(cuda::argument::__unwrap(def)[2] == 12);
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 0);
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == 100);
-    static_assert(!cuda::argument::__traits<decltype(def)>::is_single_value);
+    auto def = cuda::args::deferred_sequence{it, cuda::args::bounds<0, 100>()};
+    assert(cuda::args::__unwrap(def)[0] == 10);
+    assert(cuda::args::__unwrap(def)[2] == 12);
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 0);
+    static_assert(cuda::args::__traits<decltype(def)>::highest == 100);
+    static_assert(!cuda::args::__traits<decltype(def)>::is_single_value);
   }
 
   // Deferred sequence with both bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto def   = cuda::argument::deferred_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 4096>(), cuda::argument::bounds(5, 100)};
-    assert(def.__arg_.size() == 4);
-    assert(def.__runtime_bounds_.__lower_ == 5);
-    assert(def.__runtime_bounds_.__upper_ == 100);
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
-    assert(cuda::argument::__lowest_(def) == 5);
-    assert(cuda::argument::__highest_(def) == 100);
+    auto def   = cuda::args::deferred_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::args::bounds<1, 4096>(), cuda::args::bounds(5, 100)};
+    assert(cuda::args::__access::__arg(def).size() == 4);
+    assert(cuda::args::__access::__runtime_bounds(def).lower() == 5);
+    assert(cuda::args::__access::__runtime_bounds(def).upper() == 100);
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 1);
+    assert(cuda::args::__lowest_(def) == 5);
+    assert(cuda::args::__highest_(def) == 100);
   }
 
   // Deferred sequence with both bounds, runtime bounds first
   {
     int arr[4] = {10, 20, 30, 40};
-    auto def   = cuda::argument::deferred_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::argument::bounds(5, 100), cuda::argument::bounds<1, 4096>()};
-    static_assert(cuda::argument::__traits<decltype(def)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(def)>::highest == 4096);
-    assert(cuda::argument::__lowest_(def) == 5);
-    assert(cuda::argument::__highest_(def) == 100);
+    auto def   = cuda::args::deferred_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::args::bounds(5, 100), cuda::args::bounds<1, 4096>()};
+    static_assert(cuda::args::__traits<decltype(def)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(def)>::highest == 4096);
+    assert(cuda::args::__lowest_(def) == 5);
+    assert(cuda::args::__highest_(def) == 100);
   }
 
   // Traits: deferred is single value
   {
-    using traits = cuda::argument::__traits<cuda::argument::deferred<cuda::std::span<int, 1>>>;
+    using traits = cuda::args::__traits<cuda::args::deferred<cuda::std::span<int, 1>>>;
     static_assert(traits::is_deferred);
     static_assert(traits::is_single_value);
   }
 
   // Traits: deferred with pointer is also single value
   {
-    using traits = cuda::argument::__traits<cuda::argument::deferred<int*>>;
+    using traits = cuda::args::__traits<cuda::args::deferred<int*>>;
     static_assert(traits::is_deferred);
     static_assert(traits::is_single_value);
   }
 
   // Traits: deferred_sequence is not single value
   {
-    using traits = cuda::argument::__traits<cuda::argument::deferred_sequence<cuda::std::span<int>>>;
+    using traits = cuda::args::__traits<cuda::args::deferred_sequence<cuda::std::span<int>>>;
     static_assert(traits::is_deferred);
     static_assert(!traits::is_single_value);
   }
@@ -133,16 +133,16 @@ TEST_FUNC constexpr bool test()
   // Unwrap: deferred
   {
     int val  = 99;
-    auto def = cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}};
-    auto& v  = cuda::argument::__unwrap(def);
+    auto def = cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}};
+    auto& v  = cuda::args::__unwrap(def);
     assert(v[0] == 99);
   }
 
   // Unwrap: deferred_sequence
   {
     int arr[3]    = {10, 20, 30};
-    auto def      = cuda::argument::deferred_sequence{cuda::std::span<int>{arr, 3}};
-    const auto& v = cuda::argument::__unwrap(def);
+    auto def      = cuda::args::deferred_sequence{cuda::std::span<int>{arr, 3}};
+    const auto& v = cuda::args::__unwrap(def);
     assert(v.size() == 3);
     assert(v[1] == 20);
   }
@@ -150,14 +150,14 @@ TEST_FUNC constexpr bool test()
   // Unwrap: rvalue deferred returns by value
   {
     int val = 99;
-    auto v  = cuda::argument::__unwrap(cuda::argument::deferred{cuda::std::span<int, 1>{&val, 1}});
+    auto v  = cuda::args::__unwrap(cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}});
     assert(v[0] == 99);
   }
 
   // Unwrap: rvalue deferred_sequence returns by value
   {
     int arr[3] = {10, 20, 30};
-    auto v     = cuda::argument::__unwrap(cuda::argument::deferred_sequence{cuda::std::span<int>{arr, 3}});
+    auto v     = cuda::args::__unwrap(cuda::args::deferred_sequence{cuda::std::span<int>{arr, 3}});
     assert(v.size() == 3);
     assert(v[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
index 1ce371b84e5..64bad620293 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-[[maybe_unused]] cuda::argument::deferred_sequence<int> invalid_arg{0};
+[[maybe_unused]] cuda::args::deferred_sequence<int> invalid_arg{0};
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp
index 41a9a2ae778..111bc226ae5 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/deferred_sequence_scalar_traits.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-using traits = cuda::argument::__traits<cuda::argument::deferred_sequence<int>>;
+using traits = cuda::args::__traits<cuda::args::deferred_sequence<int>>;
 
 [[maybe_unused]] constexpr bool invalid_traits = traits::is_deferred;
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
index 23e867be616..ef27bdbec39 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/dynamic_argument.pass.cpp
@@ -25,99 +25,99 @@ TEST_FUNC constexpr bool test()
 {
   // Uniform scalar via CTAD
   {
-    auto da = cuda::argument::immediate{5};
-    assert(cuda::argument::__unwrap(da) == 5);
-    assert(da.__arg_ == 5);
-    static_assert(cuda::argument::__traits<decltype(da)>::lowest == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::argument::__traits<decltype(da)>::highest == (cuda::std::numeric_limits<int>::max)());
-    assert(cuda::argument::__lowest_(da) == 5);
-    assert(cuda::argument::__highest_(da) == 5);
-    da.__arg_ = 6;
-    assert(cuda::argument::__unwrap(da) == 6);
+    auto da = cuda::args::immediate{5};
+    assert(cuda::args::__unwrap(da) == 5);
+    assert(cuda::args::__access::__arg(da) == 5);
+    static_assert(cuda::args::__traits<decltype(da)>::lowest == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::args::__traits<decltype(da)>::highest == (cuda::std::numeric_limits<int>::max)());
+    assert(cuda::args::__lowest_(da) == 5);
+    assert(cuda::args::__highest_(da) == 5);
+    cuda::args::__access::__arg(da) = 6;
+    assert(cuda::args::__unwrap(da) == 6);
   }
 
   // Uniform scalar with static bounds
   {
-    auto da = cuda::argument::immediate{5, cuda::argument::bounds<1, 8>()};
-    assert(cuda::argument::__unwrap(da) == 5);
-    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(da)>::highest == 8);
-    assert(cuda::argument::__lowest_(da) == 5);
-    assert(cuda::argument::__highest_(da) == 5);
+    auto da = cuda::args::immediate{5, cuda::args::bounds<1, 8>()};
+    assert(cuda::args::__unwrap(da) == 5);
+    static_assert(cuda::args::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(da)>::highest == 8);
+    assert(cuda::args::__lowest_(da) == 5);
+    assert(cuda::args::__highest_(da) == 5);
   }
 
   // Non-sequence values are accepted without scalar-only restrictions
   {
-    auto da = cuda::argument::immediate{non_sequence_value{7}};
-    assert(cuda::argument::__unwrap(da).payload == 7);
+    auto da = cuda::args::immediate{non_sequence_value{7}};
+    assert(cuda::args::__unwrap(da).payload == 7);
   }
 
   // Pointer-like types can still represent a single value when explicitly wrapped that way
   {
     int value = 11;
-    auto da   = cuda::argument::immediate{&value};
-    static_assert(cuda::argument::__traits<decltype(da)>::is_single_value);
-    assert(*cuda::argument::__unwrap(da) == 11);
+    auto da   = cuda::args::immediate{&value};
+    static_assert(cuda::args::__traits<decltype(da)>::is_single_value);
+    assert(*cuda::args::__unwrap(da) == 11);
   }
 
   // Per-segment span with runtime bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::argument::bounds(1L, 100L)};
-    assert(cuda::argument::__unwrap(da).size() == 4);
-    assert(da.__arg_.size() == 4);
-    assert(da.__runtime_bounds_.__lower_ == 1);
-    assert(da.__runtime_bounds_.__upper_ == 100);
-    assert(cuda::argument::__lowest_(da) == 1);
-    assert(cuda::argument::__highest_(da) == 100);
-    da.__runtime_bounds_.__upper_ = 90;
-    assert(cuda::argument::__highest_(da) == 90);
+    auto da    = cuda::args::__immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::args::bounds(1L, 100L)};
+    assert(cuda::args::__unwrap(da).size() == 4);
+    assert(cuda::args::__access::__arg(da).size() == 4);
+    assert(cuda::args::__access::__runtime_bounds(da).lower() == 1);
+    assert(cuda::args::__access::__runtime_bounds(da).upper() == 100);
+    assert(cuda::args::__lowest_(da) == 1);
+    assert(cuda::args::__highest_(da) == 100);
+    cuda::args::__access::__runtime_bounds(da) = cuda::args::bounds(1, 90);
+    assert(cuda::args::__highest_(da) == 90);
   }
 
   // Per-segment span with both bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::argument::immediate_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(10, 200)};
-    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(da)>::highest == 256);
-    assert(cuda::argument::__lowest_(da) == 10);
-    assert(cuda::argument::__highest_(da) == 200);
+    auto da    = cuda::args::__immediate_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::args::bounds<1, 256>(), cuda::args::bounds(10, 200)};
+    static_assert(cuda::args::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(da)>::highest == 256);
+    assert(cuda::args::__lowest_(da) == 10);
+    assert(cuda::args::__highest_(da) == 200);
   }
 
   // Per-segment span with both bounds, runtime bounds first
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::argument::immediate_sequence{
-      cuda::std::span<int>{arr, 4}, cuda::argument::bounds(10, 200), cuda::argument::bounds<1, 256>()};
-    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(da)>::highest == 256);
-    assert(cuda::argument::__lowest_(da) == 10);
-    assert(cuda::argument::__highest_(da) == 200);
+    auto da    = cuda::args::__immediate_sequence{
+      cuda::std::span<int>{arr, 4}, cuda::args::bounds(10, 200), cuda::args::bounds<1, 256>()};
+    static_assert(cuda::args::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(da)>::highest == 256);
+    assert(cuda::args::__lowest_(da) == 10);
+    assert(cuda::args::__highest_(da) == 200);
   }
 
   // Per-segment via span
   {
     int arr[4] = {1, 2, 3, 4};
-    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}};
-    assert(cuda::argument::__unwrap(da).size() == 4);
-    assert(cuda::argument::__unwrap(da)[0] == 1);
-    assert(cuda::argument::__unwrap(da)[3] == 4);
+    auto da    = cuda::args::__immediate_sequence{cuda::std::span<int>{arr, 4}};
+    assert(cuda::args::__unwrap(da).size() == 4);
+    assert(cuda::args::__unwrap(da)[0] == 1);
+    assert(cuda::args::__unwrap(da)[3] == 4);
   }
 
   // Per-segment with static bounds
   {
     int arr[4] = {10, 20, 30, 40};
-    auto da    = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::argument::bounds<1, 100>()};
-    assert(cuda::argument::__unwrap(da).size() == 4);
-    assert(cuda::argument::__unwrap(da)[2] == 30);
-    static_assert(cuda::argument::__traits<decltype(da)>::lowest == 1);
-    static_assert(cuda::argument::__traits<decltype(da)>::highest == 100);
+    auto da    = cuda::args::__immediate_sequence{cuda::std::span<int>{arr, 4}, cuda::args::bounds<1, 100>()};
+    assert(cuda::args::__unwrap(da).size() == 4);
+    assert(cuda::args::__unwrap(da)[2] == 30);
+    static_assert(cuda::args::__traits<decltype(da)>::lowest == 1);
+    static_assert(cuda::args::__traits<decltype(da)>::highest == 100);
   }
 
   // Traits
   {
-    using traits = cuda::argument::__traits<cuda::argument::immediate<int>>;
+    using traits = cuda::args::__traits<cuda::args::immediate<int>>;
     static_assert(!traits::is_deferred);
     static_assert(traits::is_single_value);
     static_assert(cuda::std::is_same_v<traits::value_type, int>);
@@ -125,7 +125,7 @@ TEST_FUNC constexpr bool test()
 
   // Sequence traits
   {
-    using traits = cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>;
+    using traits = cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>;
     static_assert(!traits::is_deferred);
     static_assert(!traits::is_single_value);
     static_assert(cuda::std::is_same_v<traits::value_type, cuda::std::span<int>>);
@@ -133,39 +133,38 @@ TEST_FUNC constexpr bool test()
 
   // __is_single_value_v on unwrapped types
   {
-    static_assert(
-      cuda::argument::__is_single_value_v<cuda::argument::__traits<cuda::argument::immediate<int>>::value_type>);
-    static_assert(!cuda::argument::__traits<cuda::argument::immediate_sequence<cuda::std::span<int>>>::is_single_value);
+    static_assert(cuda::args::__is_single_value_v<cuda::args::__traits<cuda::args::immediate<int>>::value_type>);
+    static_assert(!cuda::args::__traits<cuda::args::__immediate_sequence<cuda::std::span<int>>>::is_single_value);
   }
 
   // Unwrap: scalar
   {
-    auto da = cuda::argument::immediate{7};
-    auto& v = cuda::argument::__unwrap(da);
+    auto da = cuda::args::immediate{7};
+    auto& v = cuda::args::__unwrap(da);
     assert(v == 7);
     v = 8;
-    assert(cuda::argument::__unwrap(da) == 8);
+    assert(cuda::args::__unwrap(da) == 8);
   }
 
   // Unwrap: span
   {
     int arr[3]    = {10, 20, 30};
-    auto da       = cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 3}};
-    const auto& v = cuda::argument::__unwrap(da);
+    auto da       = cuda::args::__immediate_sequence{cuda::std::span<int>{arr, 3}};
+    const auto& v = cuda::args::__unwrap(da);
     assert(v.size() == 3);
     assert(v[1] == 20);
   }
 
   // Unwrap: rvalue scalar returns by value
   {
-    const auto& v = cuda::argument::__unwrap(cuda::argument::immediate{7});
+    const auto& v = cuda::args::__unwrap(cuda::args::immediate{7});
     assert(v == 7);
   }
 
   // Unwrap: rvalue span returns by value
   {
     int arr[3] = {10, 20, 30};
-    auto v     = cuda::argument::__unwrap(cuda::argument::immediate_sequence{cuda::std::span<int>{arr, 3}});
+    auto v     = cuda::args::__unwrap(cuda::args::__immediate_sequence{cuda::std::span<int>{arr, 3}});
     assert(v.size() == 3);
     assert(v[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
index 485a17fe72b..df463b0136d 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument.pass.cpp
@@ -24,72 +24,76 @@ TEST_FUNC void test()
 {
   // Basic value
   {
-    constexpr auto sa = cuda::argument::constant<42>{};
-    static_assert(sa.value() == 42);
+    constexpr auto sa = cuda::args::constant<42>{};
+    static_assert(cuda::args::__unwrap(sa) == 42);
     static_assert(cuda::std::is_same_v<decltype(sa)::value_type, int>);
   }
 
   // Different types
   {
-    constexpr auto sa_long = cuda::argument::constant<100L>{};
-    static_assert(sa_long.value() == 100L);
+    constexpr auto sa_long = cuda::args::constant<100L>{};
+    static_assert(cuda::args::__unwrap(sa_long) == 100L);
     static_assert(cuda::std::is_same_v<decltype(sa_long)::value_type, long>);
+
+    constexpr auto sa_float = cuda::args::constant<10, float>{};
+    static_assert(cuda::args::__unwrap(sa_float) == 10.0f);
+    static_assert(cuda::std::is_same_v<decltype(sa_float)::value_type, float>);
+    static_assert(cuda::std::is_same_v<decltype(cuda::args::__unwrap(sa_float)), float>);
   }
 
   // Negative value
   {
-    constexpr auto sa_neg = cuda::argument::constant<-1>{};
-    static_assert(sa_neg.value() == -1);
+    constexpr auto sa_neg = cuda::args::constant<-1>{};
+    static_assert(cuda::args::__unwrap(sa_neg) == -1);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Non-sequence values are accepted without scalar-only restrictions
   {
-    constexpr auto sa = cuda::argument::constant<non_sequence_value{7}>{};
-    static_assert(sa.value().payload == 7);
-    static_assert(cuda::argument::__unwrap(sa).payload == 7);
+    constexpr auto sa = cuda::args::constant<non_sequence_value{7}>{};
+    static_assert(cuda::args::__unwrap(sa).payload == 7);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
 #if TEST_HAS_CLASS_NTTP
   // Array sequence
   {
-    constexpr auto sa_arr = cuda::argument::constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
-    static_assert(sa_arr.value()[0] == 128);
-    static_assert(sa_arr.value()[1] == 256);
-    static_assert(sa_arr.value()[2] == 512);
+    constexpr auto sa_arr = cuda::args::__constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
+    static_assert(cuda::args::__unwrap(sa_arr)[0] == 128);
+    static_assert(cuda::args::__unwrap(sa_arr)[1] == 256);
+    static_assert(cuda::args::__unwrap(sa_arr)[2] == 512);
     static_assert(cuda::std::is_same_v<decltype(sa_arr)::value_type, cuda::std::array<int, 3>>);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
   // Bounds: scalar
   {
-    constexpr auto sa = cuda::argument::constant<42>{};
-    static_assert(cuda::argument::__lowest_(sa) == 42);
-    static_assert(cuda::argument::__highest_(sa) == 42);
+    constexpr auto sa = cuda::args::constant<42>{};
+    static_assert(cuda::args::__lowest_(sa) == 42);
+    static_assert(cuda::args::__highest_(sa) == 42);
   }
 
 #if TEST_HAS_CLASS_NTTP
   // Bounds: array sequence computes lowest/highest of elements
   {
-    constexpr auto sa = cuda::argument::constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
-    static_assert(cuda::argument::__lowest_(sa) == 128);
-    static_assert(cuda::argument::__highest_(sa) == 512);
+    constexpr auto sa = cuda::args::__constant_sequence<cuda::std::array<int, 3>{128, 256, 512}>{};
+    static_assert(cuda::args::__lowest_(sa) == 128);
+    static_assert(cuda::args::__highest_(sa) == 512);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
 #if TEST_HAS_CLASS_NTTP
   // Bounds: empty array sequence has unconstrained element bounds
   {
-    constexpr auto sa = cuda::argument::constant_sequence<cuda::std::array<int, 0>{}>{};
-    static_assert(cuda::argument::__lowest_(sa) == cuda::std::numeric_limits<int>::lowest());
-    static_assert(cuda::argument::__highest_(sa) == (cuda::std::numeric_limits<int>::max)());
+    constexpr auto sa = cuda::args::__constant_sequence<cuda::std::array<int, 0>{}>{};
+    static_assert(cuda::args::__lowest_(sa) == cuda::std::numeric_limits<int>::lowest());
+    static_assert(cuda::args::__highest_(sa) == (cuda::std::numeric_limits<int>::max)());
   }
 #endif // TEST_HAS_CLASS_NTTP
 
   // Traits
   {
-    using traits = cuda::argument::__traits<cuda::argument::constant<42>>;
+    using traits = cuda::args::__traits<cuda::args::constant<42>>;
     static_assert(!traits::is_deferred);
     static_assert(traits::is_constant);
     static_assert(traits::is_single_value);
@@ -98,10 +102,22 @@ TEST_FUNC void test()
     static_assert(traits::highest == 42);
   }
 
+  // Traits: explicit constant value type
+  {
+    using traits = cuda::args::__traits<cuda::args::constant<10, float>>;
+    static_assert(!traits::is_deferred);
+    static_assert(traits::is_constant);
+    static_assert(traits::is_single_value);
+    static_assert(cuda::std::is_same_v<traits::value_type, float>);
+    static_assert(cuda::std::is_same_v<traits::element_type, float>);
+    static_assert(traits::lowest == 10.0f);
+    static_assert(traits::highest == 10.0f);
+  }
+
 #if TEST_HAS_CLASS_NTTP
   // Sequence traits
   {
-    using traits = cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>;
+    using traits = cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>;
     static_assert(traits::is_constant);
     static_assert(!traits::is_deferred);
     static_assert(!traits::is_single_value);
@@ -112,26 +128,33 @@ TEST_FUNC void test()
 
   // Single value: scalar is single, sequence is not
   {
-    static_assert(
-      cuda::argument::__is_single_value_v<cuda::argument::__traits<cuda::argument::constant<42>>::value_type>);
+    static_assert(cuda::args::__is_single_value_v<cuda::args::__traits<cuda::args::constant<42>>::value_type>);
 #if TEST_HAS_CLASS_NTTP
     static_assert(
-      !cuda::argument::__traits<cuda::argument::constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
+      !cuda::args::__traits<cuda::args::__constant_sequence<cuda::std::array<int, 3>{1, 2, 3}>>::is_single_value);
 #endif // TEST_HAS_CLASS_NTTP
   }
 
   // Unwrap: scalar
   {
-    constexpr auto sa  = cuda::argument::constant<42>{};
-    constexpr auto val = cuda::argument::__unwrap(sa);
+    constexpr auto sa  = cuda::args::constant<42>{};
+    constexpr auto val = cuda::args::__unwrap(sa);
     static_assert(val == 42);
   }
 
+  // Unwrap: scalar with explicit value type
+  {
+    constexpr auto sa  = cuda::args::constant<10, float>{};
+    constexpr auto val = cuda::args::__unwrap(sa);
+    static_assert(val == 10.0f);
+    static_assert(cuda::std::is_same_v<decltype(val), const float>);
+  }
+
 #if TEST_HAS_CLASS_NTTP
   // Unwrap: sequence
   {
-    constexpr auto sa  = cuda::argument::constant_sequence<cuda::std::array<int, 3>{10, 20, 30}>{};
-    constexpr auto val = cuda::argument::__unwrap(sa);
+    constexpr auto sa  = cuda::args::__constant_sequence<cuda::std::array<int, 3>{10, 20, 30}>{};
+    constexpr auto val = cuda::args::__unwrap(sa);
     static_assert(val[0] == 10);
     static_assert(val[2] == 30);
   }
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp
index 66a1828769b..b59d41fd7a1 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_argument_bounds_type.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-using arg_t = cuda::argument::immediate<int, cuda::argument::runtime_bounds<int>>;
+using arg_t = cuda::args::immediate<int, cuda::args::runtime_bounds<int>>;
 
 [[maybe_unused]] arg_t invalid_arg{0};
 
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
index 7f9902e50a2..5212c8a1f9c 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_conversion.fail.cpp
@@ -10,9 +10,9 @@
 
 #include <cuda/argument>
 
-using arg_t = cuda::argument::immediate<unsigned char, cuda::argument::static_bounds<0, 1000>>;
+using arg_t = cuda::args::immediate<unsigned char, cuda::args::static_bounds<0, 1000>>;
 
-[[maybe_unused]] constexpr auto invalid_highest = cuda::argument::__traits<arg_t>::highest;
+[[maybe_unused]] constexpr auto invalid_highest = cuda::args::__traits<arg_t>::highest;
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
index 8170fdd7ee5..02ba7ecfe96 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/static_bounds_type_mismatch.fail.cpp
@@ -10,7 +10,7 @@
 
 #include <cuda/argument>
 
-[[maybe_unused]] constexpr auto invalid_bounds = cuda::argument::static_bounds<0, 1L>{};
+[[maybe_unused]] constexpr auto invalid_bounds = cuda::args::static_bounds<0, 1L>{};
 
 int main(int, char**)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
index f449975cb06..33b193edc8e 100644
--- a/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/argument/usage_example.pass.cpp
@@ -10,7 +10,7 @@
 
 // Integration test: demonstrates how an algorithm consumes argument wrappers
 // to make compile-time and runtime resource decisions.
-// All argument types (plain values, static, dynamic, deferred) work uniformly
+// All argument types (plain values, constants, immediate values, deferred values) work uniformly
 // through the free functions.
 
 #include <cuda/argument>
@@ -34,7 +34,7 @@ enum class algorithm_variant
 template <class _SegSizeArg>
 TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg)
 {
-  if constexpr (cuda::argument::__traits<_SegSizeArg>::highest <= shared_memory_capacity)
+  if constexpr (cuda::args::__traits<_SegSizeArg>::highest <= shared_memory_capacity)
   {
     return algorithm_variant::shared_memory;
   }
@@ -48,7 +48,7 @@ TEST_FUNC constexpr algorithm_variant select_variant(_SegSizeArg)
 template <class _SegSizeArg>
 TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_segments)
 {
-  auto __highest = cuda::std::min(default_max_segment_size, static_cast<int>(cuda::argument::__highest_(__seg_size)));
+  auto __highest = cuda::std::min(default_max_segment_size, static_cast<int>(cuda::args::__highest_(__seg_size)));
   return __highest * __num_segments;
 }
 
@@ -56,9 +56,9 @@ TEST_FUNC constexpr int compute_buffer_size(_SegSizeArg __seg_size, int __num_se
 template <class _SegSizeArg>
 TEST_FUNC constexpr int process_segments(_SegSizeArg __seg_size)
 {
-  const auto& __val = cuda::argument::__unwrap(__seg_size);
+  const auto& __val = cuda::args::__unwrap(__seg_size);
 
-  if constexpr (cuda::argument::__traits<_SegSizeArg>::is_single_value)
+  if constexpr (cuda::args::__traits<_SegSizeArg>::is_single_value)
   {
     return static_cast<int>(__val);
   }
@@ -91,82 +91,82 @@ TEST_FUNC constexpr bool test()
     assert(process_segments(seg) == 64 + 128 + 96);
   }
 
-  // static_argument: scalar, fits in shared memory, buffer = value
+  // constant: scalar, fits in shared memory, buffer = value
   {
-    constexpr auto seg_size = cuda::argument::constant<128>{};
+    constexpr auto seg_size = cuda::args::constant<128>{};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 128 * 4);
     assert(process_segments(seg_size) == 128);
   }
 
 #if TEST_HAS_CLASS_NTTP
-  // static_argument: array sequence, highest fits in shared memory
+  // __constant_sequence: array sequence, highest fits in shared memory
   {
-    constexpr auto seg_sizes = cuda::argument::constant_sequence<cuda::std::array{64, 128, 256}>{};
+    constexpr auto seg_sizes = cuda::args::__constant_sequence<cuda::std::array{64, 128, 256}>{};
     static_assert(select_variant(seg_sizes) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 256 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 256);
   }
 
-  // static_argument: array sequence, highest exceeds shared memory, buffer clamped
+  // __constant_sequence: array sequence, highest exceeds shared memory, buffer clamped
   {
-    constexpr auto seg_sizes = cuda::argument::constant_sequence<cuda::std::array{64, 128, 512}>{};
+    constexpr auto seg_sizes = cuda::args::__constant_sequence<cuda::std::array{64, 128, 512}>{};
     static_assert(select_variant(seg_sizes) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 512 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 512);
   }
 #endif // TEST_HAS_CLASS_NTTP
 
-  // dynamic_argument: tight static bounds, shared memory, buffer = value
+  // immediate: tight static bounds, shared memory, buffer = value
   {
-    constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 256>()};
+    constexpr auto seg_size = cuda::args::immediate{100, cuda::args::bounds<1, 256>()};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
   }
 
-  // dynamic_argument: wide static bounds, global memory, buffer = value
+  // immediate: wide static bounds, global memory, buffer = value
   {
-    constexpr auto seg_size = cuda::argument::immediate{100, cuda::argument::bounds<1, 4096>()};
+    constexpr auto seg_size = cuda::args::immediate{100, cuda::args::bounds<1, 4096>()};
     static_assert(select_variant(seg_size) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
   }
 
-  // dynamic_argument: no bounds, global memory, buffer = value
+  // immediate: no bounds, global memory, buffer = value
   {
-    constexpr auto seg_size = cuda::argument::immediate{100};
+    constexpr auto seg_size = cuda::args::immediate{100};
     static_assert(select_variant(seg_size) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_size, 4) == 100 * 4);
     assert(process_segments(seg_size) == 100);
   }
 
-  // dynamic_argument: per-segment span with runtime bounds only
+  // __immediate_sequence: per-segment span with runtime bounds only
   {
     int sizes[3]   = {64, 128, 96};
-    auto seg_sizes = cuda::argument::immediate_sequence{cuda::std::span<int>{sizes, 3}, cuda::argument::bounds(1, 200)};
+    auto seg_sizes = cuda::args::__immediate_sequence{cuda::std::span<int>{sizes, 3}, cuda::args::bounds(1, 200)};
     assert(select_variant(seg_sizes) == algorithm_variant::global_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 200 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 96);
   }
 
-  // dynamic_argument: per-segment span with both bounds
+  // __immediate_sequence: per-segment span with both bounds
   {
     int sizes[3]   = {64, 128, 96};
-    auto seg_sizes = cuda::argument::immediate_sequence{
-      cuda::std::span<int>{sizes, 3}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)};
-    static_assert(cuda::argument::__traits<decltype(seg_sizes)>::highest <= shared_memory_capacity);
+    auto seg_sizes = cuda::args::__immediate_sequence{
+      cuda::std::span<int>{sizes, 3}, cuda::args::bounds<1, 256>(), cuda::args::bounds(1, 200)};
+    static_assert(cuda::args::__traits<decltype(seg_sizes)>::highest <= shared_memory_capacity);
     assert(select_variant(seg_sizes) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_sizes, 3) == 200 * 3);
     assert(process_segments(seg_sizes) == 64 + 128 + 96);
   }
 
-  // deferred_argument: uniform, bounds for decisions only
+  // deferred: uniform, bounds for decisions only
   {
-    int val       = 100;
-    auto seg_size = cuda::argument::deferred{
-      cuda::std::span<int, 1>{&val, 1}, cuda::argument::bounds<1, 256>(), cuda::argument::bounds(1, 200)};
-    static_assert(cuda::argument::__traits<decltype(seg_size)>::highest <= shared_memory_capacity);
+    int val = 100;
+    auto seg_size =
+      cuda::args::deferred{cuda::std::span<int, 1>{&val, 1}, cuda::args::bounds<1, 256>(), cuda::args::bounds(1, 200)};
+    static_assert(cuda::args::__traits<decltype(seg_size)>::highest <= shared_memory_capacity);
     assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(compute_buffer_size(seg_size, 4) == 200 * 4);
   }
@@ -179,17 +179,24 @@ TEST_FUNC constexpr bool test()
     assert(process_segments(1.0f) == 1);
   }
 
+  // constant float using an integer NTTP and explicit value type
+  {
+    constexpr auto seg_size = cuda::args::constant<128, float>{};
+    static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
+    assert(process_segments(seg_size) == 128);
+  }
+
 #if TEST_HAS_CLASS_NTTP
-  // static_argument float (float NTTPs require C++20)
+  // constant float (float NTTPs require C++20)
   {
-    constexpr auto seg_size = cuda::argument::constant<128.0f>{};
+    constexpr auto seg_size = cuda::args::constant<128.0f>{};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(process_segments(seg_size) == 128);
   }
 
-  // dynamic_argument float with static bounds
+  // immediate float with static bounds
   {
-    constexpr auto seg_size = cuda::argument::immediate{100.0f, cuda::argument::bounds<1.0f, 256.0f>()};
+    constexpr auto seg_size = cuda::args::immediate{100.0f, cuda::args::bounds<1.0f, 256.0f>()};
     static_assert(select_variant(seg_size) == algorithm_variant::shared_memory);
     assert(process_segments(seg_size) == 100);
   }