From ac143a8ca68f86768723e08c56e935681c37aaef Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Fri, 5 Jun 2026 01:38:42 -0700
Subject: [PATCH 1/4] adds max_total_num_items guarantee

---
 .../include/cuda/__execution/guarantee.h      |  81 ++++++++++
 .../cuda/__execution/max_total_num_items.h    | 140 ++++++++++++++++++
 libcudacxx/include/cuda/execution             |   2 +
 libcudacxx/include/cuda/execution.guarantee.h |  26 ++++
 .../cuda/execution.max_total_num_items.h      |  26 ++++
 .../cuda/execution/guarantee.fail.cpp         |  29 ++++
 .../cuda/execution/guarantee.pass.cpp         |  40 +++++
 .../execution/max_total_num_items.fail.cpp    |  24 +++
 .../execution/max_total_num_items.pass.cpp    |  84 +++++++++++
 9 files changed, 452 insertions(+)
 create mode 100644 libcudacxx/include/cuda/__execution/guarantee.h
 create mode 100644 libcudacxx/include/cuda/__execution/max_total_num_items.h
 create mode 100644 libcudacxx/include/cuda/execution.guarantee.h
 create mode 100644 libcudacxx/include/cuda/execution.max_total_num_items.h
 create mode 100644 libcudacxx/test/libcudacxx/cuda/execution/guarantee.fail.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/execution/guarantee.pass.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.fail.cpp
 create mode 100644 libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp

diff --git a/libcudacxx/include/cuda/__execution/guarantee.h b/libcudacxx/include/cuda/__execution/guarantee.h
new file mode 100644
index 00000000000..485a096ff96
--- /dev/null
+++ b/libcudacxx/include/cuda/__execution/guarantee.h
@@ -0,0 +1,81 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDA___EXECUTION_GUARANTEE_H
+#define __CUDA___EXECUTION_GUARANTEE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__execution/env.h>
+#include <cuda/std/__type_traits/is_base_of.h>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION
+
+//! @brief Base class of all guarantees that can be passed to @c cuda::execution::guarantee.
+//!
+//! A guarantee is a promise that the caller makes to an algorithm about its input or the problem being solved (e.g. an
+//! upper bound on the total number of items). Algorithms may exploit guarantees to select faster code paths or smaller
+//! intermediate types. This is the dual of @c cuda::execution::__requirement, which describes a property that the caller
+//! demands from the algorithm. Unlike requirements, guarantees may be stateful, i.e. they may carry a runtime value.
+class __guarantee
+{};
+
+struct __get_guarantees_t // query tag: operator() forwards to the environment's query(*this)
+{
+  _CCCL_EXEC_CHECK_DISABLE
+  _CCCL_TEMPLATE(class _Env)
+  _CCCL_REQUIRES(::cuda::std::execution::__queryable_with<_Env, __get_guarantees_t>)
+  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(const _Env& __env) const noexcept
+  {
+    static_assert(noexcept(__env.query(*this))); // operator() is noexcept, so the forwarded query must be too
+    return __env.query(*this);
+  }
+  //! Answers @c forwarding_query_t with @c true, i.e. this query declares itself a forwarding query.
+  [[nodiscard]]
+  _CCCL_NODEBUG_API static constexpr auto query(::cuda::std::execution::forwarding_query_t) noexcept -> bool
+  {
+    return true;
+  }
+};
+
+_CCCL_GLOBAL_CONSTANT auto __get_guarantees = __get_guarantees_t{};
+
+//! @brief Bundles a pack of guarantees into an environment that can be passed to device-wide parallel algorithms.
+//!
+//! The returned property is keyed by @c __get_guarantees_t so that individual guarantees are only visible to algorithms
+//! through the guarantees environment, mirroring how @c cuda::execution::require exposes requirements. Each guarantee is
+//! stored by value, preserving any runtime state it carries.
+template <class... _Guarantees>
+[[nodiscard]] _CCCL_NODEBUG_API auto guarantee(_Guarantees... __guarantees)
+{
+  static_assert((::cuda::std::is_base_of_v<__guarantee, _Guarantees> && ...),
+                "Only guarantees can be passed to guarantee");
+  // Copy every guarantee into one environment so that any runtime value it carries is kept.
+  ::cuda::std::execution::env<_Guarantees...> __env{__guarantees...};
+  // Expose that environment only under the __get_guarantees_t key.
+  return ::cuda::std::execution::prop{__get_guarantees_t{}, __env};
+}
+
+_CCCL_END_NAMESPACE_CUDA_EXECUTION
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // __CUDA___EXECUTION_GUARANTEE_H
diff --git a/libcudacxx/include/cuda/__execution/max_total_num_items.h b/libcudacxx/include/cuda/__execution/max_total_num_items.h
new file mode 100644
index 00000000000..23df63a540c
--- /dev/null
+++ b/libcudacxx/include/cuda/__execution/max_total_num_items.h
@@ -0,0 +1,140 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CUDA___EXECUTION_MAX_TOTAL_NUM_ITEMS_H
+#define __CUDA___EXECUTION_MAX_TOTAL_NUM_ITEMS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__execution/guarantee.h>
+#include <cuda/std/__concepts/concept_macros.h>
+#include <cuda/std/__execution/env.h>
+#include <cuda/std/__type_traits/is_integral.h>
+#include <cuda/std/__utility/cmp.h>
+#include <cuda/std/limits>
+
+#include <cuda/std/__cccl/prologue.h>
+
+_CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION
+
+struct __get_max_total_num_items_t;
+
+//! @brief Guarantee describing an upper bound on the total number of items processed by an algorithm (e.g. the combined
+//! size of all segments handled by cub::DeviceBatchedTopK).
+//!
+//! The bound is carried as an integral value whose type is inferred from the argument; that type distinguishes, for
+//! example, a 32-bit from a 64-bit bound and lets algorithms size intermediate offset types accordingly. The bound can
+//! be expressed as a compile-time bound (@c static_highest), a runtime bound (@c highest()), or both. A composable
+//! @c min_total_num_items lower-bound guarantee may be added in the future.
+template <class _Tp, _Tp _StaticHighest>
+struct _CCCL_DECLSPEC_EMPTY_BASES __max_total_num_items_holder_t : __guarantee
+{
+  static_assert(::cuda::std::is_integral_v<_Tp>, "max_total_num_items requires an integral bound type");
+  //! Integral type in which both bounds are expressed.
+  using element_type = _Tp;
+  //! Compile-time upper bound on the total number of items.
+  static constexpr element_type static_highest = _StaticHighest;
+  //! Runtime upper bound on the total number of items; returned by @c highest().
+  element_type __highest_;
+
+  //! @brief Returns the effective (runtime) upper bound on the total number of items.
+  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto highest() const noexcept -> element_type
+  {
+    return __highest_;
+  }
+  //! @brief Answers the @c __get_max_total_num_items_t query with the guarantee itself.
+  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto query(const __get_max_total_num_items_t&) const noexcept
+    -> const __max_total_num_items_holder_t&
+  {
+    return *this;
+  }
+};
+
+struct __get_max_total_num_items_t // query tag: operator() forwards to the environment's query(*this)
+{
+  _CCCL_EXEC_CHECK_DISABLE
+  _CCCL_TEMPLATE(class _Env)
+  _CCCL_REQUIRES(::cuda::std::execution::__queryable_with<_Env, __get_max_total_num_items_t>)
+  [[nodiscard]] _CCCL_NODEBUG_API constexpr auto operator()(const _Env& __env) const noexcept
+  {
+    static_assert(noexcept(__env.query(*this))); // operator() is noexcept, so the forwarded query must be too
+    return __env.query(*this);
+  }
+  //! Answers @c forwarding_query_t with @c true, i.e. this query declares itself a forwarding query.
+  [[nodiscard]]
+  _CCCL_NODEBUG_API static constexpr auto query(::cuda::std::execution::forwarding_query_t) noexcept -> bool
+  {
+    return true;
+  }
+};
+
+_CCCL_GLOBAL_CONSTANT auto __get_max_total_num_items = __get_max_total_num_items_t{};
+
+//! @brief Creates a guarantee with a compile-time upper bound on the total number of items.
+//!
+//! The bound type is inferred from the non-type template parameter, which must be integral. The runtime bound
+//! returned by @c highest() is initialized to the same value.
+//! @tparam _Highest Compile-time upper bound on the total number of items.
+//! @return A guarantee that can be passed to @c cuda::execution::guarantee.
+template <auto _Highest>
+[[nodiscard]] _CCCL_NODEBUG_API constexpr auto max_total_num_items() noexcept
+  -> __max_total_num_items_holder_t<decltype(_Highest), _Highest>
+{
+  static_assert(::cuda::std::is_integral_v<decltype(_Highest)>, "max_total_num_items requires an integral bound");
+  return __max_total_num_items_holder_t<decltype(_Highest), _Highest>{{}, _Highest};
+}
+
+//! @brief Creates a guarantee with a runtime upper bound on the total number of items.
+//!
+//! The bound type is inferred from the argument, which must be integral. No tighter compile-time bound is known, so
+//! @c static_highest is set to the largest value representable by that type.
+//! @param __highest Runtime upper bound on the total number of items.
+//! @return A guarantee that can be passed to @c cuda::execution::guarantee.
+_CCCL_TEMPLATE(class _Tp)
+_CCCL_REQUIRES(::cuda::std::is_integral_v<_Tp>)
+[[nodiscard]] _CCCL_NODEBUG_API constexpr auto max_total_num_items(_Tp __highest) noexcept
+  -> __max_total_num_items_holder_t<_Tp, (::cuda::std::numeric_limits<_Tp>::max)()>
+{
+  return __max_total_num_items_holder_t<_Tp, (::cuda::std::numeric_limits<_Tp>::max)()>{{}, __highest};
+}
+
+//! @brief Creates a guarantee with both a compile-time and a runtime upper bound on the total number of items.
+//!
+//! The bound type is inferred from the non-type template parameter; the runtime bound is converted to that type. The
+//! runtime bound must not exceed the compile-time bound, which is checked with @c _CCCL_ASSERT (the comparison uses
+//! @c cuda::std::cmp_less_equal, so bounds of differing signedness compare correctly).
+//! @tparam _Highest Compile-time upper bound on the total number of items.
+//! @param __highest Runtime upper bound on the total number of items, must be `<= _Highest`.
+//! @return A guarantee that can be passed to @c cuda::execution::guarantee.
+template <auto _Highest, class _Tp>
+[[nodiscard]] _CCCL_NODEBUG_API constexpr auto max_total_num_items(_Tp __highest) noexcept
+  -> __max_total_num_items_holder_t<decltype(_Highest), _Highest>
+{
+  static_assert(::cuda::std::is_integral_v<decltype(_Highest)>,
+                "max_total_num_items requires an integral static bound");
+  static_assert(::cuda::std::is_integral_v<_Tp>, "max_total_num_items requires an integral runtime bound");
+  _CCCL_ASSERT(::cuda::std::cmp_less_equal(__highest, _Highest),
+               "max_total_num_items: the runtime bound must not exceed the static bound");
+  return __max_total_num_items_holder_t<decltype(_Highest), _Highest>{{}, static_cast<decltype(_Highest)>(__highest)};
+}
+
+_CCCL_END_NAMESPACE_CUDA_EXECUTION
+
+#include <cuda/std/__cccl/epilogue.h>
+
+#endif // __CUDA___EXECUTION_MAX_TOTAL_NUM_ITEMS_H
diff --git a/libcudacxx/include/cuda/execution b/libcudacxx/include/cuda/execution
index dfb698bcb4b..c9fafee22ea 100644
--- a/libcudacxx/include/cuda/execution
+++ b/libcudacxx/include/cuda/execution
@@ -22,6 +22,8 @@
 #endif // no system header
 
 #include <cuda/__execution/determinism.h>
+#include <cuda/__execution/guarantee.h>
+#include <cuda/__execution/max_total_num_items.h>
 #include <cuda/__execution/output_ordering.h>
 #include <cuda/__execution/policy.h>
 #include <cuda/__execution/require.h>
diff --git a/libcudacxx/include/cuda/execution.guarantee.h b/libcudacxx/include/cuda/execution.guarantee.h
new file mode 100644
index 00000000000..3f891675c34
--- /dev/null
+++ b/libcudacxx/include/cuda/execution.guarantee.h
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_EXECUTION_EXECUTION_GUARANTEE_H
+#define _CUDA_EXECUTION_EXECUTION_GUARANTEE_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__execution/guarantee.h>
+
+#endif // _CUDA_EXECUTION_EXECUTION_GUARANTEE_H
diff --git a/libcudacxx/include/cuda/execution.max_total_num_items.h b/libcudacxx/include/cuda/execution.max_total_num_items.h
new file mode 100644
index 00000000000..8e0570d4cfd
--- /dev/null
+++ b/libcudacxx/include/cuda/execution.max_total_num_items.h
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _CUDA_EXECUTION_EXECUTION_MAX_TOTAL_NUM_ITEMS_H
+#define _CUDA_EXECUTION_EXECUTION_MAX_TOTAL_NUM_ITEMS_H
+
+#include <cuda/std/detail/__config>
+
+#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
+#  pragma GCC system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
+#  pragma clang system_header
+#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
+#  pragma system_header
+#endif // no system header
+
+#include <cuda/__execution/max_total_num_items.h>
+
+#endif // _CUDA_EXECUTION_EXECUTION_MAX_TOTAL_NUM_ITEMS_H
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/guarantee.fail.cpp b/libcudacxx/test/libcudacxx/cuda/execution/guarantee.fail.cpp
new file mode 100644
index 00000000000..b1301ec12e2
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/execution/guarantee.fail.cpp
@@ -0,0 +1,29 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/execution.guarantee.h>
+
+[[maybe_unused]] _CCCL_GLOBAL_CONSTANT struct query_t
+{
+} query{};
+
+TEST_FUNC void test()
+{
+  // not every environment is a guarantee
+  cuda::std::execution::prop p{query, 42};
+  cuda::execution::guarantee(p);
+}
+
+int main(int, char**)
+{
+  test();
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/guarantee.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/guarantee.pass.cpp
new file mode 100644
index 00000000000..8eb43d3baee
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/execution/guarantee.pass.cpp
@@ -0,0 +1,40 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/execution.guarantee.h>
+#include <cuda/execution.max_total_num_items.h>
+
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+TEST_FUNC void test()
+{
+  namespace exec = cuda::execution;
+
+  // A guarantee is only visible to an algorithm through the guarantees environment produced by guarantee(...), mirroring
+  // how requirements are only visible through the requirements environment produced by require(...).
+  const auto genv     = exec::guarantee(exec::max_total_num_items<1000>());
+  const auto resolved = exec::__get_max_total_num_items(exec::__get_guarantees(genv));
+  static_assert(cuda::std::is_base_of_v<exec::__guarantee, cuda::std::remove_cvref_t<decltype(resolved)>>);
+  assert(resolved.highest() == 1000);
+
+  // The guarantees query is a forwarding query, just like the requirements query.
+  static_assert(cuda::std::execution::forwarding_query(exec::__get_guarantees_t{}));
+}
+
+int main(int, char**)
+{
+  test();
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.fail.cpp b/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.fail.cpp
new file mode 100644
index 00000000000..d9c4d398fc2
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.fail.cpp
@@ -0,0 +1,24 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/execution.max_total_num_items.h>
+
+TEST_FUNC void test()
+{
+  // The bound must be of integral type: a floating-point argument has no viable overload.
+  [[maybe_unused]] auto guarantee = cuda::execution::max_total_num_items(1.5);
+}
+
+int main(int, char**)
+{
+  test();
+
+  return 0;
+}
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp
new file mode 100644
index 00000000000..e69c2a2b952
--- /dev/null
+++ b/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp
@@ -0,0 +1,84 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of libcu++, the C++ Standard Library for your entire system,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cuda/execution.max_total_num_items.h>
+
+#include <cuda/std/cassert>
+#include <cuda/std/cstdint>
+#include <cuda/std/limits>
+#include <cuda/std/type_traits>
+
+#include "test_macros.h"
+
+TEST_FUNC void test()
+{
+  namespace exec = cuda::execution;
+
+  // (a) static upper bound: the element type is inferred from the non-type template parameter.
+  {
+    const auto guarantee = exec::max_total_num_items<1000>();
+    using holder_t       = cuda::std::remove_cvref_t<decltype(guarantee)>;
+    static_assert(cuda::std::is_base_of_v<exec::__guarantee, holder_t>);
+    static_assert(cuda::std::is_same_v<holder_t::element_type, int>);
+    static_assert(holder_t::static_highest == 1000);
+    assert(guarantee.highest() == 1000);
+  }
+
+  // A bound that does not fit into int infers a wider type, distinguishing 32-bit from 64-bit bounds.
+  {
+    const auto guarantee = exec::max_total_num_items<5'000'000'000>();
+    using holder_t       = cuda::std::remove_cvref_t<decltype(guarantee)>;
+    static_assert(sizeof(holder_t::element_type) == 8);
+    static_assert(holder_t::static_highest == 5'000'000'000);
+  }
+
+  // The element type can be selected explicitly through the literal type.
+  {
+    const auto guarantee = exec::max_total_num_items<cuda::std::int16_t{1000}>();
+    using holder_t       = cuda::std::remove_cvref_t<decltype(guarantee)>;
+    static_assert(cuda::std::is_same_v<holder_t::element_type, cuda::std::int16_t>);
+  }
+
+  // (b) runtime upper bound: the element type is inferred from the argument, the static bound spans the whole type.
+  {
+    const auto guarantee = exec::max_total_num_items(cuda::std::int32_t{1'000'000'000});
+    using holder_t       = cuda::std::remove_cvref_t<decltype(guarantee)>;
+    static_assert(cuda::std::is_same_v<holder_t::element_type, cuda::std::int32_t>);
+    static_assert(holder_t::static_highest == (cuda::std::numeric_limits<cuda::std::int32_t>::max)());
+    assert(guarantee.highest() == 1'000'000'000);
+  }
+
+  // (c) both static and runtime upper bounds; the runtime bound is narrower than the static one.
+  {
+    const auto guarantee = exec::max_total_num_items<1000>(500);
+    using holder_t       = cuda::std::remove_cvref_t<decltype(guarantee)>;
+    static_assert(holder_t::static_highest == 1000);
+    assert(guarantee.highest() == 500);
+  }
+
+  // The query returns the guarantee itself, preserving both the compile-time and the runtime bounds.
+  {
+    const auto guarantee = exec::max_total_num_items<1000>(500);
+    const auto resolved  = exec::__get_max_total_num_items(guarantee);
+    using holder_t       = cuda::std::remove_cvref_t<decltype(resolved)>;
+    static_assert(holder_t::static_highest == 1000);
+    assert(resolved.highest() == 500);
+  }
+
+  // The query is a forwarding query, just like the requirement queries.
+  static_assert(cuda::std::execution::forwarding_query(exec::__get_max_total_num_items_t{}));
+}
+
+int main(int, char**)
+{
+  test();
+
+  return 0;
+}

From 90b75814fa6843af20e014af2abb112cd0b58ba1 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Fri, 5 Jun 2026 01:53:59 -0700
Subject: [PATCH 2/4] fixes format

---
 libcudacxx/include/cuda/__execution/guarantee.h          | 9 +++++----
 .../test/libcudacxx/cuda/execution/guarantee.pass.cpp    | 5 ++---
 .../cuda/execution/max_total_num_items.pass.cpp          | 1 -
 3 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/libcudacxx/include/cuda/__execution/guarantee.h b/libcudacxx/include/cuda/__execution/guarantee.h
index 485a096ff96..03f99da8757 100644
--- a/libcudacxx/include/cuda/__execution/guarantee.h
+++ b/libcudacxx/include/cuda/__execution/guarantee.h
@@ -33,8 +33,9 @@ _CCCL_BEGIN_NAMESPACE_CUDA_EXECUTION
 //!
 //! A guarantee is a promise that the caller makes to an algorithm about its input or the problem being solved (e.g. an
 //! upper bound on the total number of items). Algorithms may exploit guarantees to select faster code paths or smaller
-//! intermediate types. This is the dual of @c cuda::execution::__requirement, which describes a property that the caller
-//! demands from the algorithm. Unlike requirements, guarantees may be stateful, i.e. they may carry a runtime value.
+//! intermediate types. This is the dual of @c cuda::execution::__requirement, which describes a property that the
+//! caller demands from the algorithm. Unlike requirements, guarantees may be stateful, i.e. they may carry a runtime
+//! value.
 class __guarantee
 {};
 
@@ -61,8 +62,8 @@ _CCCL_GLOBAL_CONSTANT auto __get_guarantees = __get_guarantees_t{};
 //! @brief Bundles a pack of guarantees into an environment that can be passed to device-wide parallel algorithms.
 //!
 //! The returned property is keyed by @c __get_guarantees_t so that individual guarantees are only visible to algorithms
-//! through the guarantees environment, mirroring how @c cuda::execution::require exposes requirements. Each guarantee is
-//! stored by value, preserving any runtime state it carries.
+//! through the guarantees environment, mirroring how @c cuda::execution::require exposes requirements. Each guarantee
+//! is stored by value, preserving any runtime state it carries.
 template <class... _Guarantees>
 [[nodiscard]] _CCCL_NODEBUG_API auto guarantee(_Guarantees... __guarantees)
 {
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/guarantee.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/guarantee.pass.cpp
index 8eb43d3baee..0d67e6bec4d 100644
--- a/libcudacxx/test/libcudacxx/cuda/execution/guarantee.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/execution/guarantee.pass.cpp
@@ -10,7 +10,6 @@
 
 #include <cuda/execution.guarantee.h>
 #include <cuda/execution.max_total_num_items.h>
-
 #include <cuda/std/cassert>
 #include <cuda/std/cstdint>
 #include <cuda/std/type_traits>
@@ -21,8 +20,8 @@ TEST_FUNC void test()
 {
   namespace exec = cuda::execution;
 
-  // A guarantee is only visible to an algorithm through the guarantees environment produced by guarantee(...), mirroring
-  // how requirements are only visible through the requirements environment produced by require(...).
+  // A guarantee is only visible to an algorithm through the guarantees environment produced by guarantee(...),
+  // mirroring how requirements are only visible through the requirements environment produced by require(...).
   const auto genv     = exec::guarantee(exec::max_total_num_items<1000>());
   const auto resolved = exec::__get_max_total_num_items(exec::__get_guarantees(genv));
   static_assert(cuda::std::is_base_of_v<exec::__guarantee, cuda::std::remove_cvref_t<decltype(resolved)>>);
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp
index e69c2a2b952..8220878dccb 100644
--- a/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp
@@ -9,7 +9,6 @@
 //===----------------------------------------------------------------------===//
 
 #include <cuda/execution.max_total_num_items.h>
-
 #include <cuda/std/cassert>
 #include <cuda/std/cstdint>
 #include <cuda/std/limits>

From 38ff9c881557f7eceb839816abd62806b124b14f Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Fri, 5 Jun 2026 02:08:45 -0700
Subject: [PATCH 3/4] includes test_macros header

---
 libcudacxx/test/libcudacxx/cuda/execution/guarantee.fail.cpp    | 2 ++
 .../test/libcudacxx/cuda/execution/max_total_num_items.fail.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/libcudacxx/test/libcudacxx/cuda/execution/guarantee.fail.cpp b/libcudacxx/test/libcudacxx/cuda/execution/guarantee.fail.cpp
index b1301ec12e2..a6174f5eb4a 100644
--- a/libcudacxx/test/libcudacxx/cuda/execution/guarantee.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/execution/guarantee.fail.cpp
@@ -10,6 +10,8 @@
 
 #include <cuda/execution.guarantee.h>
 
+#include "test_macros.h"
+
 [[maybe_unused]] _CCCL_GLOBAL_CONSTANT struct query_t
 {
 } query{};
diff --git a/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.fail.cpp b/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.fail.cpp
index d9c4d398fc2..baffa2de37b 100644
--- a/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.fail.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.fail.cpp
@@ -10,6 +10,8 @@
 
 #include <cuda/execution.max_total_num_items.h>
 
+#include "test_macros.h"
+
 TEST_FUNC void test()
 {
   // The bound must be of integral type: a floating-point argument has no viable overload.

From 1a1e06b4230f6a5bf21575f7a36324a5faa68820 Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Sat, 6 Jun 2026 22:36:29 -0700
Subject: [PATCH 4/4] fixes unused warning

---
 .../test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp b/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp
index 8220878dccb..86081a0d029 100644
--- a/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp
+++ b/libcudacxx/test/libcudacxx/cuda/execution/max_total_num_items.pass.cpp
@@ -36,6 +36,7 @@ TEST_FUNC void test()
     using holder_t       = cuda::std::remove_cvref_t<decltype(guarantee)>;
     static_assert(sizeof(holder_t::element_type) == 8);
     static_assert(holder_t::static_highest == 5'000'000'000);
+    assert(guarantee.highest() == 5'000'000'000);
   }
 
   // The element type can be selected explicitly through the literal type.
@@ -43,6 +44,7 @@ TEST_FUNC void test()
     const auto guarantee = exec::max_total_num_items<cuda::std::int16_t{1000}>();
     using holder_t       = cuda::std::remove_cvref_t<decltype(guarantee)>;
     static_assert(cuda::std::is_same_v<holder_t::element_type, cuda::std::int16_t>);
+    assert(guarantee.highest() == 1000);
   }
 
   // (b) runtime upper bound: the element type is inferred from the argument, the static bound spans the whole type.