Skip to content
Open
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions docs/libcudacxx/extended_api/warp.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Warp

warp/warp_shuffle
warp/warp_match_all
warp/warp_match_any
warp/lane_mask

.. list-table::
Expand Down Expand Up @@ -45,6 +46,11 @@ Warp
- CCCL 3.1.0
- CUDA 13.1

* - :ref:`warp_match_any <libcudacxx-extended-api-warp-warp-match-any>`
- Get the mask of lanes with the same value
- CCCL 3.5.0
- CUDA 13.5

* - :ref:`lane_mask <libcudacxx-extended-api-warp-lane-mask>`
- Class to represent a mask of lanes in a warp
- CCCL 3.1.0
Expand Down
18 changes: 11 additions & 7 deletions docs/libcudacxx/extended_api/warp/warp_match_all.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,10 @@ Defined in ``<cuda/warp>`` header.
The functionality provides a generalized and safe alternative to CUDA warp match all intrinsic ``__match_all_sync``.
The function allows bitwise comparison of any data size, including raw arrays, pointers, and structs.

.. note::

The underlying CUDA intrinsic does not provide memory ordering.

**Parameters**

- ``data``: data to compare.
Expand All @@ -30,7 +34,7 @@ The function allows bitwise comparison of any data size, including raw arrays, p
**Constraints**

- ``T`` shall be trivially copyable, see :ref:`cuda::is_trivially_copyable <libcudacxx-extended-api-type_traits-is_trivially_copyable>`.
- When ``__builtin_clear_padding`` is not supported, ``T`` shall have no padding bits, that is, ``T``'s value representation shall be identical to its object representation.
- ``T`` shall be bitwise comparable, see :ref:`cuda::is_bitwise_comparable <libcudacxx-extended-api-type_traits-is_bitwise_comparable>`, except when ``__builtin_clear_padding`` is supported. In the latter case, ``T`` can have padding bits.

**Preconditions**

Expand All @@ -39,17 +43,17 @@ The function allows bitwise comparison of any data size, including raw arrays, p

**Undefined Behavior**

- ``lane_mask`` must represent a subset of the active lanes, undefined behavior otherwise.
- ``lane_mask`` must represent a subset of the active lanes.
- All non-exited lanes specified by ``lane_mask`` must execute the function with the same ``lane_mask`` value.

**Performance considerations**

- The function calls the PTX instruction ``match.sync`` :math:`ceil\left(\frac{sizeof(data)}{4}\right)` times.
- The function is slightly faster when called with a mask of all active lanes (overload function) even if all lanes participates in the call.
- The function is slower when called with a non-fully active warp.
- The function is faster when called with a mask representing all active lanes in a warp (default value of the second parameter ``lane_mask``).

**References**

- `CUDA match_all Intrinsics <https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-match-functions>`_
- `CUDA match_all Intrinsics <https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/cpp-language-extensions.html#warp-match-functions>`_
- `PTX match.sync instruction <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync>`_

Example
Expand All @@ -69,7 +73,7 @@ Example
__global__ void warp_match_kernel() {
assert(cuda::device::warp_match_all(2));
assert(cuda::device::warp_match_all(2, cuda::device::lane_mask::all()));
assert(cuda::device::warp_match_all(MyStruct{1.0, 3})); // Undefined Behavior
assert(cuda::device::warp_match_all(MyStruct{1.0, 3})); // compile error, except when __builtin_clear_padding is supported
assert(!cuda::device::warp_match_all(threadIdx.x));
}

Expand All @@ -79,4 +83,4 @@ Example
return 0;
}

`See it on Godbolt 🔗 <https://godbolt.org/z/Eq81fTb8z>`_
`See it on Godbolt 🔗 <https://godbolt.org/z/x1sWbx14r>`_
98 changes: 98 additions & 0 deletions docs/libcudacxx/extended_api/warp/warp_match_any.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
.. _libcudacxx-extended-api-warp-warp-match-any:

``cuda::device::warp_match_any``
================================

Defined in ``<cuda/warp>`` header.

.. code:: cuda

namespace cuda::device {

template <typename T>
[[nodiscard]] __device__ lane_mask
warp_match_any(const T& data, lane_mask = lane_mask::all());

} // namespace cuda::device

The functionality provides a generalized and safe alternative to CUDA warp match any intrinsic ``__match_any_sync``.
The function allows bitwise comparison of any data size, including raw arrays, pointers, and structs.

.. note::

The underlying CUDA intrinsic does not provide memory ordering.

**Parameters**

- ``data``: data to compare.
- ``lane_mask``: mask of the active lanes.

**Return value**

- A ``lane_mask`` representing the non-exited lanes in ``lane_mask`` that have the same bitwise value for ``data`` as the calling lane.

**Constraints**

- ``T`` shall be trivially copyable, see :ref:`cuda::is_trivially_copyable <libcudacxx-extended-api-type_traits-is_trivially_copyable>`.
- When ``__builtin_clear_padding`` is not supported, ``T`` shall have no padding bits, that is, ``T``'s value representation shall be identical to its object representation.

**Preconditions**

- The functionality is only supported on ``SM >= 70``.
- ``lane_mask`` must be non-zero.

**Undefined Behavior**

- ``lane_mask`` must represent a subset of the active lanes.
- All non-exited lanes specified by ``lane_mask`` must execute the function with the same ``lane_mask`` value.

**Performance considerations**

- The function calls the PTX instruction ``match.sync`` :math:`ceil\left(\frac{sizeof(data)}{4}\right)` times.
- The function is faster when called with a mask representing all active lanes in a warp (default value of the second parameter ``lane_mask``).

**References**

- `CUDA match_any Intrinsics <https://docs.nvidia.com/cuda/cuda-programming-guide/05-appendices/cpp-language-extensions.html#warp-match-functions>`_
- `PTX match.sync instruction <https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#parallel-synchronization-and-communication-instructions-match-sync>`_

Example
-------

.. code:: cuda

#include <cuda/std/array>
#include <cuda/std/cassert>
#include <cuda/warp>

struct MyStruct {
double x; // 8 bytes
int y; // 4 bytes
}; // 4 bytes of padding

__global__ void warp_match_kernel() {
{
auto mask = cuda::device::warp_match_any(threadIdx.x / 4);
auto expected = cuda::device::lane_mask{0b1111 << ((threadIdx.x / 4) * 4)};
assert(mask == expected);
}
{
auto mask = cuda::device::warp_match_any(2);
auto expected = cuda::device::lane_mask{0xFFFFFFFF};
assert(mask == expected);
}
{
// compile error, except when __builtin_clear_padding is supported
auto mask = cuda::device::warp_match_any(MyStruct{1.0, 3});
auto expected = cuda::device::lane_mask{0xFFFFFFFF};
assert(mask == expected);
}
}

int main() {
warp_match_kernel<<<1, 32>>>();
cudaDeviceSynchronize();
return 0;
}

`See it on Godbolt 🔗 <https://godbolt.org/z/Ys1McG8nv>`_
84 changes: 84 additions & 0 deletions libcudacxx/include/cuda/__warp/warp_match_any.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA___WARP_WARP_MATCH_ANY_H
#define _CUDA___WARP_WARP_MATCH_ANY_H

#include <cuda/std/detail/__config>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#if _CCCL_CUDA_COMPILATION()

# include <cuda/__cmath/ceil_div.h>
# include <cuda/__type_traits/is_bitwise_comparable.h>
# include <cuda/__type_traits/is_trivially_copyable.h>
# include <cuda/__warp/lane_mask.h>
# include <cuda/std/__cstring/memcpy.h>
# include <cuda/std/__memory/addressof.h>
# include <cuda/std/cstdint>

# include <cuda/std/__cccl/prologue.h>

_CCCL_BEGIN_NAMESPACE_CUDA_DEVICE

extern "C" _CCCL_DEVICE void __cuda__match_any_sync_is_not_supported_before_SM_70__();

//! @brief Returns the mask of lanes with the same bitwise value as the calling lane.
//!
//! @param[in] __data The data to compare across lanes.
//! @param[in] __lane_mask The mask of participating lanes.
//!
//! @return A lane mask containing lanes in `__lane_mask` whose `__data` matches the calling lane's data.
template <class _Tp>
[[nodiscard]] _CCCL_DEVICE_API lane_mask
warp_match_any(const _Tp& __data, const lane_mask __lane_mask = lane_mask::all()) noexcept
{
static_assert(is_trivially_copyable_v<_Tp>, "data must be trivially copyable");
_CCCL_ASSERT(__lane_mask != lane_mask::none(), "lane_mask must be non-zero");

constexpr int __ratio = ::cuda::ceil_div(sizeof(_Tp), sizeof(::cuda::std::uint32_t));
::cuda::std::uint32_t __array[__ratio]{};

# if defined(_CCCL_BUILTIN_CLEAR_PADDING)
auto __data_copy = __data;
_CCCL_BUILTIN_CLEAR_PADDING(&__data_copy);
const auto __data_ptr = ::cuda::std::addressof(__data_copy);
Comment on lines +57 to +59
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Important: This introduces a needless copy. I believe this should only copy if is_bitwise_comparable_v is false

Copy link
Copy Markdown
Contributor Author

@fbusato fbusato Jun 4, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes, I'm aware of this issue. However, is_bitwise_comparable_v also checks padding. I need to introduce another (internal) traits for that. I will open a second PR

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fine by me, although I would like to have an integer overload that does not do any of that and just forwards to the builtin

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it is needed. Direct call and cuda::device::warp_match_any produce identical code as expected https://godbolt.org/z/sKaWz5dv1

# else // ^^^ _CCCL_BUILTIN_CLEAR_PADDING ^^^ / vvv !_CCCL_BUILTIN_CLEAR_PADDING vvv
static_assert(is_bitwise_comparable_v<_Tp>, "data must be bitwise comparable");
const auto __data_ptr = ::cuda::std::addressof(__data);
# endif // _CCCL_BUILTIN_CLEAR_PADDING
::cuda::std::memcpy(__array, __data_ptr, sizeof(_Tp));

lane_mask __ret = __lane_mask;
_CCCL_PRAGMA_UNROLL_FULL()
for (int i = 0; i < __ratio; ++i)
{
::cuda::std::uint32_t __match_any_result = 0;
NV_IF_ELSE_TARGET(NV_PROVIDES_SM_70,
(__match_any_result = ::__match_any_sync(__lane_mask.value(), __array[i]);),
(::cuda::device::__cuda__match_any_sync_is_not_supported_before_SM_70__();));
__ret &= lane_mask{__match_any_result};
}
return __ret;
}

_CCCL_END_NAMESPACE_CUDA_DEVICE

# include <cuda/std/__cccl/epilogue.h>

#endif // _CCCL_CUDA_COMPILATION()
#endif // _CUDA___WARP_WARP_MATCH_ANY_H
1 change: 1 addition & 0 deletions libcudacxx/include/cuda/warp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

#include <cuda/__warp/lane_mask.h>
#include <cuda/__warp/warp_match_all.h>
#include <cuda/__warp/warp_match_any.h>
#include <cuda/__warp/warp_shuffle.h>

#endif // _CUDA_WARP
97 changes: 97 additions & 0 deletions libcudacxx/test/libcudacxx/cuda/warp/warp_match_any.pass.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
//===----------------------------------------------------------------------===//
//
// Part of the libcu++ Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: pre-sm-70

// UNSUPPORTED: enable-tile
// error: asm statement is unsupported in tile code

#include <cuda/std/array>
#include <cuda/std/cassert>
#include <cuda/std/cstdint>
#include <cuda/warp>

#include "test_macros.h"

TEST_DEVICE_FUNC uint32_t make_low_mask(unsigned count)
{
return count == 32 ? 0xFFFFFFFF : ((1u << count) - 1);
}

TEST_DEVICE_FUNC uint32_t make_stride_mask(unsigned count, unsigned step, unsigned remainder)
{
uint32_t mask = 0;
for (unsigned lane = 0; lane < count; ++lane)
{
if ((lane % step) == remainder)
{
mask |= uint32_t{1} << lane;
}
}
return mask;
}

template <typename T>
TEST_DEVICE_FUNC void test_all_equal(T value = T{})
{
for (unsigned i = 1; i <= 32; ++i)
{
auto mask = cuda::device::lane_mask{make_low_mask(i)};
if (threadIdx.x < i)
{
assert(cuda::device::warp_match_any(value, mask) == mask);
}
}
}

// two different groups of lanes
template <typename T>
TEST_DEVICE_FUNC void test_grouped(T valueA = T{}, T valueB = T{1})
{
for (unsigned i = 2; i <= 32; ++i)
{
auto mask = cuda::device::lane_mask{make_low_mask(i)};
if (threadIdx.x < i)
{
auto value = threadIdx.x % 2 == 0 ? valueA : valueB;
auto expected = cuda::device::lane_mask{make_stride_mask(i, 2, threadIdx.x % 2)};
assert(cuda::device::warp_match_any(value, mask) == expected);
}
}
}

__global__ void test_kernel()
Comment thread
fbusato marked this conversation as resolved.
Outdated
{
using array_t = cuda::std::array<char, 6>;

test_all_equal<uint8_t>();
test_all_equal<uint16_t>();
test_all_equal<uint32_t>();
test_all_equal<uint64_t>();
#if _CCCL_HAS_INT128()
test_all_equal<__uint128_t>();
#endif
Comment thread
fbusato marked this conversation as resolved.
Outdated
test_all_equal(char3{0, 0, 0});
test_all_equal(array_t{0, 0, 0, 0, 0, 0});

test_grouped<uint8_t>();
test_grouped<uint16_t>();
test_grouped<uint32_t>();
test_grouped<uint64_t>();
#if _CCCL_HAS_INT128()
test_grouped<__uint128_t>();
#endif
Comment thread
fbusato marked this conversation as resolved.
Outdated
test_grouped(char3{0, 0, 0}, char3{1, 1, 1});
test_grouped(array_t{0, 0, 0, 0, 0, 0}, array_t{1, 1, 1, 1, 1, 1});
}

int main(int, char**)
{
NV_IF_TARGET(NV_IS_HOST, (test_kernel<<<1, 32>>>();))
return 0;
}
Loading