NVIDIA · pciolkosz · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
@@ -4,7 +4,7 @@
 #include <cub/detail/choose_offset.cuh>
 #include <cub/device/dispatch/dispatch_batched_topk.cuh>
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 
 #include <nvbench_helper.cuh>
@@ -51,7 +51,7 @@ void fixed_seg_size_topk_keys(
   const auto selected_elements = static_cast<::cuda::std::ptrdiff_t>(MaxNumSelected);
   const auto num_segments      = ::cuda::std::max<std::size_t>(1, (max_elements / segment_size));
   const auto elements          = num_segments * segment_size;
-  const auto total_num_items   = ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(elements)};
+  const auto total_num_items   = ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(elements)};
   const bit_entropy entropy    = str_to_entropy(state.get_string("Entropy"));
 
   // Skip workloads where k exceeds the segment size
@@ -68,9 +68,9 @@ void fixed_seg_size_topk_keys(
   auto d_keys_in      = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_in_ptr), segment_size);
   auto d_keys_out     = cuda::make_strided_iterator(cuda::make_counting_iterator(d_keys_out_ptr), selected_elements);
 
-  auto segment_sizes    = ::cuda::__argument::__constant<MaxSegmentSize>{};
-  auto k                = ::cuda::__argument::__constant<MaxNumSelected>{};
-  auto select_direction = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
+  auto segment_sizes    = ::cuda::args::constant<MaxSegmentSize>{};
+  auto k                = ::cuda::args::constant<MaxNumSelected>{};
+  auto select_direction = ::cuda::args::constant<cub::detail::topk::select::max>{};
 
   state.add_element_count(elements, "NumElements");
   state.add_element_count(segment_size, "SegmentSize");
@@ -99,7 +99,7 @@ void fixed_seg_size_topk_keys(
       segment_sizes,
       k,
       select_direction,
-      ::cuda::__argument::__immediate{static_cast<::cuda::std::int64_t>(num_segments)},
+      ::cuda::args::immediate{static_cast<::cuda::std::int64_t>(num_segments)},
       total_num_items,
       env);
   });

@@ -8,7 +8,7 @@
 #include <thrust/reduce.h>
 #include <thrust/tabulate.h>
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/iterator>
 #include <cuda/random>
 #include <cuda/std/algorithm>
@@ -172,17 +172,17 @@ void variable_seg_size_topk_keys(nvbench::state& state,
     static_cast<cuda::std::int64_t>(MaxSegmentSize));
   const auto input_elements  = thrust::reduce(d_segment_sizes.begin(), d_segment_sizes.end());
   const auto output_elements = static_cast<std::size_t>(num_segments) * K;
-  const auto total_num_items = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(input_elements)};
+  const auto total_num_items = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(input_elements)};
 
   auto in_keys_buffer = gen_data<MaxSegmentSize, K>(
     num_segments, string_to_pattern(state.get_string("Pattern")), thrust::raw_pointer_cast(d_segment_sizes.data()));
   auto out_keys_buffer = thrust::device_vector<KeyT>(output_elements, thrust::no_init);
 
-  auto segment_sizes_param = ::cuda::__argument::__immediate_sequence{
-    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::__argument::__bounds<1, MaxSegmentSize>()};
-  auto k_param            = ::cuda::__argument::__constant<K>{};
-  auto select_direction   = ::cuda::__argument::__constant<cub::detail::topk::select::max>{};
-  auto num_segments_param = ::cuda::__argument::__immediate{static_cast<cuda::std::int64_t>(num_segments)};
+  auto segment_sizes_param = ::cuda::args::__immediate_sequence{
+    thrust::raw_pointer_cast(d_segment_sizes.data()), ::cuda::args::bounds<1, MaxSegmentSize>()};
+  auto k_param            = ::cuda::args::constant<K>{};
+  auto select_direction   = ::cuda::args::constant<cub::detail::topk::select::max>{};
+  auto num_segments_param = ::cuda::args::immediate{static_cast<cuda::std::int64_t>(num_segments)};
 
   auto d_keys_in = cuda::make_strided_iterator(
     cuda::make_counting_iterator(thrust::raw_pointer_cast(in_keys_buffer.data())),

@@ -23,8 +23,8 @@
 #include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
 #include <cub/util_type.cuh>
 
-#include <cuda/__argument_>
 #include <cuda/__cmath/ceil_div.h>
+#include <cuda/argument>
 
 CUB_NAMESPACE_BEGIN
 
@@ -73,8 +73,8 @@ struct agent_batched_topk_worker_per_segment
   using key_t   = it_value_t<key_it_t>;
   using value_t = it_value_t<value_it_t>;
 
-  using segment_size_val_t = typename ::cuda::__argument::__traits<SegmentSizeParameterT>::element_type;
-  using num_segments_val_t = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
+  using segment_size_val_t = typename ::cuda::args::__traits<SegmentSizeParameterT>::element_type;
+  using num_segments_val_t = typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type;
   using counters_t         = batched_topk_counters<num_segments_val_t>;
 
   static constexpr auto policy                 = PolicyGetter{}();
@@ -95,7 +95,7 @@ struct agent_batched_topk_worker_per_segment
     multi_worker_per_segment_policy.threads_per_block * multi_worker_per_segment_policy.items_per_thread;
 
   // Check if there could be large segments present
-  static constexpr bool only_small_segments = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= tile_size;
+  static constexpr bool only_small_segments = ::cuda::args::__traits<SegmentSizeParameterT>::highest <= tile_size;
 
   // Check if we are dealing with keys-only or key-value pairs
   static constexpr bool is_keys_only = ::cuda::std::is_same_v<value_t, cub::NullType>;
@@ -196,8 +196,8 @@ struct agent_batched_topk_worker_per_segment
       return;
     }
 
-    constexpr bool is_full_tile = ::cuda::__argument::__traits<SegmentSizeParameterT>::is_constant
-                               && ::cuda::__argument::__traits<SegmentSizeParameterT>::lowest == tile_size;
+    constexpr bool is_full_tile = ::cuda::args::__traits<SegmentSizeParameterT>::is_constant
+                               && ::cuda::args::__traits<SegmentSizeParameterT>::lowest == tile_size;
 
     // Resolve Segment Parameters
     const auto segment_size = params::get_param(segment_sizes, segment_id);

@@ -13,7 +13,7 @@
 #  pragma system_header
 #endif // no system header
 
-#include <cuda/__argument_>
+#include <cuda/argument>
 #include <cuda/std/__type_traits/integral_constant.h>
 #include <cuda/std/__type_traits/remove_cvref.h>
 #include <cuda/std/__utility/forward.h>
@@ -33,10 +33,10 @@ namespace detail::params
 //! @param[in] __index Segment index to read for sequence arguments.
 //! @return The single argument value, or the sequence element at the given index.
 _CCCL_TEMPLATE(class _Tp, class _SegmentIndexT)
-_CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
+_CCCL_REQUIRES((!::cuda::args::__is_wrapper_v<::cuda::std::remove_cvref_t<_Tp>>) )
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(_Tp&& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  if constexpr (::cuda::__argument::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
+  if constexpr (::cuda::args::__traits<::cuda::std::remove_cvref_t<_Tp>>::is_single_value)
   {
     return __arg;
   }
@@ -46,46 +46,46 @@ _CCCL_REQUIRES((!::cuda::__argument::__is_wrapper_v<::cuda::std::remove_cvref_t<
   }
 }
 
-template <auto _Value, class _SegmentIndexT>
+template <auto _Value, class _Tp, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__constant<_Value>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::constant<_Value, _Tp>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg);
+  return ::cuda::args::__unwrap(__arg);
 }
 
 template <auto _Value, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::__constant_sequence<_Value>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg)[__index];
+  return ::cuda::args::__unwrap(__arg)[__index];
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
-[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
-  const ::cuda::__argument::__immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
+get_param(const ::cuda::args::immediate<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg);
+  return ::cuda::args::__unwrap(__arg);
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::__immediate_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg)[__index];
+  return ::cuda::args::__unwrap(__arg)[__index];
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
-[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto get_param(
-  const ::cuda::__argument::__deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
+[[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
+get_param(const ::cuda::args::deferred<_Arg, _StaticBounds>& __arg, [[maybe_unused]] _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg);
+  return ::cuda::args::__unwrap(__arg);
 }
 
 template <class _Arg, class _StaticBounds, class _SegmentIndexT>
 [[nodiscard]] _CCCL_HOST_DEVICE constexpr auto
-get_param(const ::cuda::__argument::__deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
+get_param(const ::cuda::args::deferred_sequence<_Arg, _StaticBounds>& __arg, _SegmentIndexT __index) noexcept
 {
-  return ::cuda::__argument::__unwrap(__arg)[__index];
+  return ::cuda::args::__unwrap(__arg)[__index];
 }
 
 // =====================================================================

@@ -31,10 +31,10 @@
 
 #include <thrust/system/cuda/detail/core/triple_chevron_launch.h>
 
-#include <cuda/__argument_>
 #include <cuda/__cmath/ceil_div.h>
 #include <cuda/__iterator/counting_iterator.h>
 #include <cuda/__iterator/transform_iterator.h>
+#include <cuda/argument>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/__type_traits/is_same.h>
 #include <cuda/std/__type_traits/remove_cv.h>
@@ -49,9 +49,9 @@ namespace detail::batched_topk
 // Internal: wrap user-facing select direction into discrete param for dispatch
 // -----------------------------------------------------------------------------
 
-// Uniform (compile-time): __constant<Dir> -> single-option uniform_discrete_param.
-template <detail::topk::select Dir>
-[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::__argument::__constant<Dir>)
+// Uniform (compile-time): constant<Dir> -> single-option uniform_discrete_param.
+template <detail::topk::select Dir, class _Tp>
+[[nodiscard]] _CCCL_HOST_DEVICE auto wrap_select_direction(::cuda::args::constant<Dir, _Tp>)
 {
   return params::uniform_discrete_param<detail::topk::select, Dir>{Dir};
 }
@@ -126,7 +126,7 @@ template <typename KeyInputItItT,
           typename PolicySelector = policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
                                                                it_value_t<it_value_t<ValueInputItItT>>,
                                                                ::cuda::std::int64_t,
-                                                               ::cuda::__argument::__traits<KParameterT>::highest>>
+                                                               ::cuda::args::__traits<KParameterT>::highest>>
 #if _CCCL_HAS_CONCEPTS()
   requires batched_topk_policy_selector<PolicySelector>
 #endif // _CCCL_HAS_CONCEPTS()
@@ -145,7 +145,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   cudaStream_t stream                             = nullptr,
   [[maybe_unused]] PolicySelector policy_selector = {})
 {
-  using large_segment_tile_offset_t = typename ::cuda::__argument::__traits<TotalNumItemsGuaranteeT>::element_type;
+  using large_segment_tile_offset_t = typename ::cuda::args::__traits<TotalNumItemsGuaranteeT>::element_type;
 
   // Wrap the raw enum into the internal discrete param type
   auto select_directions          = wrap_select_direction(select_direction);
@@ -171,9 +171,9 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   static constexpr int worker_per_segment_tile_size =
     worker_per_segment_policy.threads_per_block * worker_per_segment_policy.items_per_thread;
   static constexpr bool any_small_segments =
-    ::cuda::__argument::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
+    ::cuda::args::__traits<SegmentSizeParameterT>::lowest <= worker_per_segment_tile_size;
   static constexpr bool only_small_segments =
-    ::cuda::__argument::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
+    ::cuda::args::__traits<SegmentSizeParameterT>::highest <= worker_per_segment_tile_size;
 
   // Allocation layout:
   //   only_small_segments: [0] dummy.
@@ -183,7 +183,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
   static constexpr int allocations_array_size     = only_small_segments ? 1 : (any_small_segments ? 3 : 2);
   size_t allocation_sizes[allocations_array_size] = {1};
 
-  using num_segments_val_t         = typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type;
+  using num_segments_val_t         = typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type;
   using counters_t                 = batched_topk_counters<num_segments_val_t>;
   using segment_size_scan_offset_t = detail::choose_offset_t<num_segments_val_t>;
   using segment_size_scan_input_op_t =
@@ -239,7 +239,7 @@ CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE cudaError_t dispatch(
 
   // TODO (elstehle): support number of segments provided by device-accessible iterator
   // Only uniform number of segments are supported (i.e., we need to resolve the number of segments on the host)
-  static_assert(::cuda::__argument::__traits<NumSegmentsParameterT>::is_single_value,
+  static_assert(::cuda::args::__traits<NumSegmentsParameterT>::is_single_value,
                 "Only uniform segment sizes are currently supported.");
 
   if constexpr (any_small_segments)
@@ -341,7 +341,7 @@ template <typename KeyInputItItT,
     policy_selector_from_types<it_value_t<it_value_t<KeyInputItItT>>,
                                it_value_t<it_value_t<ValueInputItItT>>,
                                ::cuda::std::int64_t,
-                               ::cuda::__argument::__traits<KParameterT>::highest>;
+                               ::cuda::args::__traits<KParameterT>::highest>;
   return detail::dispatch_with_env_and_tuning<default_policy_selector>(
     env, [&](auto policy_selector, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
       return dispatch(

@@ -20,8 +20,8 @@
 #include <cub/device/dispatch/tuning/tuning_batched_topk.cuh>
 #include <cub/util_arch.cuh>
 
-#include <cuda/__argument_>
 #include <cuda/__device/compute_capability.h>
+#include <cuda/argument>
 
 CUB_NAMESPACE_BEGIN
 
@@ -39,7 +39,7 @@ private:
     worker_policy worker_per_segment_policy;
     multi_worker_policy multi_worker_per_segment_policy;
   };
-  static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::__argument::__traits<SegmentSizeParameterT>::highest;
+  static constexpr ::cuda::std::int64_t max_segment_size = ::cuda::args::__traits<SegmentSizeParameterT>::highest;
   static constexpr batched_topk_policy active_policy     = current_policy<PolicySelector>();
 
   template <int Index>
@@ -133,8 +133,8 @@ __launch_bounds__(int(
     KParameterT k,
     SelectDirectionParameterT select_directions,
     NumSegmentsParameterT num_segments,
-    batched_topk_counters<typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type>* d_counters,
-    typename ::cuda::__argument::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
+    batched_topk_counters<typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type>* d_counters,
+    typename ::cuda::args::__traits<NumSegmentsParameterT>::element_type* d_large_segments_ids,
     LargeSegmentTileOffsetT* d_large_segments_tile_offsets)
 {
   using agent_t = typename find_smallest_covering_policy<
@@ -151,7 +151,7 @@ __launch_bounds__(int(
     LargeSegmentTileOffsetT>::agent_t;
 
   // Static Assertions (Constraints)
-  static_assert(agent_t::tile_size >= ::cuda::__argument::__traits<SegmentSizeParameterT>::highest,
+  static_assert(agent_t::tile_size >= ::cuda::args::__traits<SegmentSizeParameterT>::highest,
                 "Block size exceeds maximum segment size supported by SegmentSizeParameterT");
   static_assert(sizeof(typename agent_t::TempStorage) <= max_smem_per_block,
                 "Static shared memory per block must not exceed 48KB limit.");

@@ -151,11 +151,11 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small fixed-size segments",
   batched_topk_keys(
     d_keys_in,
     d_keys_out,
-    ::cuda::__argument::__immediate{segment_size, ::cuda::__argument::__bounds<segment_size_t{1}, max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::args::immediate{segment_size, ::cuda::args::bounds<segment_size_t{1}, max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
     direction,
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_segments * segment_size});
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_segments * segment_size});
   // Prepare expected results
   fixed_size_segmented_sort_keys(expected_keys, num_segments, segment_size, direction);
   compact_sorted_keys_to_topk(expected_keys, segment_size, k);
@@ -248,12 +248,12 @@ C2H_TEST("DeviceBatchedTopK::{Min,Max}Keys work with small variable-size segment
   batched_topk_keys(
     d_keys_in,
     d_keys_out,
-    ::cuda::__argument::__immediate_sequence{
-      segment_size_it, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<segment_size_t{1}, static_max_k>()},
+    ::cuda::args::__immediate_sequence{
+      segment_size_it, ::cuda::args::bounds<segment_size_t{1}, static_max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<segment_size_t{1}, static_max_k>()},
     direction,
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_items});
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_items});
 
   // Verify keys are returned correctly: sort each segment of the expected input, then compact the top-k
   segmented_sort_keys(expected_keys, num_segments, segment_offsets.cbegin(), segment_offsets.cbegin() + 1, direction);
@@ -286,12 +286,11 @@ C2H_TEST("DeviceBatchedTopK::MinKeys preserves -0.0f in output", "[keys][segment
   batched_topk_keys(
     d_keys_in_it,
     d_keys_out_it,
-    ::cuda::__argument::__immediate{
-      segment_size, ::cuda::__argument::__bounds<cuda::std::int64_t{1}, max_segment_size>()},
-    ::cuda::__argument::__immediate{k, ::cuda::__argument::__bounds<cuda::std::int64_t{1}, k>()},
+    ::cuda::args::immediate{segment_size, ::cuda::args::bounds<cuda::std::int64_t{1}, max_segment_size>()},
+    ::cuda::args::immediate{k, ::cuda::args::bounds<cuda::std::int64_t{1}, k>()},
     cub::detail::topk::select::min,
-    ::cuda::__argument::__immediate{num_segments},
-    ::cuda::__argument::__immediate{num_segments * segment_size});
+    ::cuda::args::immediate{num_segments},
+    ::cuda::args::immediate{num_segments * segment_size});
 
   const int num_minus_zero = static_cast<int>(thrust::count_if(d_keys_out.begin(), d_keys_out.end(), is_minus_zero{}));
   REQUIRE(num_minus_zero >= 1);