Skip to content
Closed
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
141 changes: 141 additions & 0 deletions include/cuco/detail/reduction_functor_impl.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
/*
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuda/atomic>
#include <type_traits>
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated

namespace cuco {
namespace detail {

/**
* @brief Base class of all reduction functors.
*
* @warning This class should not be used directly.
*
*/
class reduction_functor_base {
};

template <typename T, typename Enable = void>
struct reduce_add_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
return lhs.fetch_add(rhs) + rhs;
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
}
};

template <typename T, typename Enable = void>
struct reduce_min_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
return min(lhs.fetch_min(rhs), rhs);
}
};

template <typename T, typename Enable = void>
struct reduce_max_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
return max(lhs.fetch_max(rhs), rhs);
}
};

template <typename T, typename Enable = void>
struct reduce_count_impl {
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& /* rhs */) const noexcept
{
return ++lhs;
}
};

// remove the following WAR once libcu++ extends FP atomics support and fixes signed integer atomics
// https://github.com/NVIDIA/libcudacxx/pull/286
template <typename T>
struct reduce_add_impl<
T,
typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T rhs) const noexcept
{
if constexpr (Scope == cuda::thread_scope_system)
return atomicAdd_system(reinterpret_cast<T*>(&lhs), rhs) + rhs;
else if constexpr (Scope == cuda::thread_scope_device)
return atomicAdd(reinterpret_cast<T*>(&lhs), rhs) + rhs;
else
return atomicAdd_block(reinterpret_cast<T*>(&lhs), rhs) + rhs;
}
};

template <typename T>
struct reduce_min_impl<T,
typename cuda::std::enable_if<cuda::std::is_integral<T>::value &&
cuda::std::is_signed<T>::value>::type> {
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
using InternalT = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
InternalT* ptr = reinterpret_cast<InternalT*>(&lhs);
InternalT value = rhs;
if constexpr (Scope == cuda::thread_scope_system)
return min(atomicMin_system(ptr, value), value);
else if constexpr (Scope == cuda::thread_scope_device)
return min(atomicMin(ptr, value), value);
else
return min(atomicMin_block(ptr, value), value);
}
};

template <typename T>
struct reduce_max_impl<T,
typename cuda::std::enable_if<cuda::std::is_integral<T>::value &&
cuda::std::is_signed<T>::value>::type> {
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
template <cuda::thread_scope Scope>
__device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept
{
using InternalT = typename cuda::std::conditional<sizeof(T) == 8, long long int, int>::type;
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
InternalT* ptr = reinterpret_cast<InternalT*>(&lhs);
InternalT value = rhs;
if constexpr (Scope == cuda::thread_scope_system)
return max(atomicMax_system(ptr, value), value);
else if constexpr (Scope == cuda::thread_scope_device)
return max(atomicMax(ptr, value), value);
else
return max(atomicMax_block(ptr, value), value);
}
};

template <typename T>
struct reduce_min_impl<
T,
typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
__device__ T operator()(T lhs, T rhs) const noexcept { return min(lhs, rhs); }
};

template <typename T>
struct reduce_max_impl<
T,
typename cuda::std::enable_if<cuda::std::is_floating_point<T>::value>::type> {
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
__device__ T operator()(T lhs, T rhs) const noexcept { return max(lhs, rhs); }
};

} // namespace detail
} // namespace cuco
194 changes: 194 additions & 0 deletions include/cuco/reduction_functors.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <atomic>
#include <cuco/detail/reduction_functor_impl.cuh>

#include <cuda/atomic>
#include <limits>
#include <type_traits>
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated

namespace cuco {

/**
* @brief Wrapper for reduction identity value.
*
* @tparam T The underlying value type used for reduction
*/
template <typename T>
class identity_value {
public:
using type = T;
constexpr identity_value(T const& identity) noexcept : identity_(identity) {}
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
constexpr T value() const noexcept { return identity_; }

private:
T identity_;
};

/**
* @brief Wrapper for a user-defined custom reduction operator.
*
* External synchronization, if required,
* is established via an atomic compare-and-swap loop.
*
* Example:
* \code{.cpp}
* template <typename T>
* struct custom_plus {
* __device__ T operator()(T const& lhs, T const& rhs) const noexcept {
* return lhs + rhs;
* }
* };
*
* template <typename T>
* struct custom_plus_sync {
* template <cuda::thread_scope Scope>
* __device__ T operator()(cuda::atomic<T, Scope>& lhs, T const& rhs) const noexcept {
* return lhs.fetch_add(rhs) + rhs;
* }
* };
*
* int main() {
* cuco::identity_value<int> identity{0}; // define the identity value for the given reduction
* operation, i.e., op(identity, x) == x
*
* auto f1 = cuco::reduction_functor<custom_plus<int>, int>(identity); // synchronized via
* CAS-loop auto f2 = cuco::reduction_functor<custom_plus_sync<int>, int>(identity); // implicitly
* synchronized
* }
* \endcode
*
* @tparam Func The user-defined reduction functor
* @tparam Value The value type used for reduction
*/
template <typename Func, typename Value>
class reduction_functor : detail::reduction_functor_base {
public:
using value_type = Value;

reduction_functor(cuco::identity_value<Value> identity, Func functor = Func{}) noexcept
: identity_(identity), functor_(functor)
{
}

template <cuda::thread_scope Scope>
__device__ value_type operator()(cuda::atomic<value_type, Scope>& lhs,
value_type const& rhs) const noexcept
{
if constexpr (uses_external_sync()) {
value_type old = lhs.load(cuda::memory_order_relaxed);
value_type desired;

do {
desired = functor_(old, rhs);
} while (!lhs.compare_exchange_weak(
old, desired, cuda::memory_order_release, cuda::memory_order_relaxed));

return desired;
} else {
return functor_(lhs, rhs);
}
}

__host__ __device__ value_type identity() const noexcept { return identity_.value(); }

__host__ __device__ static constexpr bool uses_external_sync() noexcept
{
return !atomic_invocable_ || naive_invocable_;
}

private:
cuco::identity_value<value_type> identity_;
Func functor_;
static constexpr bool naive_invocable_ =
std::is_invocable_r<value_type, Func, value_type, value_type>::value;
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
static constexpr bool atomic_invocable_ =
std::is_invocable_r<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_system>&,
value_type>::value ||
Comment thread
sleeepyjack marked this conversation as resolved.
Outdated
std::is_invocable_r<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_device>&,
value_type>::value ||
std::is_invocable_r<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_block>&,
value_type>::value ||
std::is_invocable_r<value_type,
Func,
cuda::atomic<value_type, cuda::thread_scope_thread>&,
value_type>::value;

static_assert(atomic_invocable_ || naive_invocable_,
"Invalid operator signature. Valid signatures are "
"(T const&, T const&)->T and (cuda::atomic<T, Scope>&, T const&)->T.");
static_assert(!(__nv_is_extended_device_lambda_closure_type(Func) ||
__nv_is_extended_host_device_lambda_closure_type(Func)),
"Extended __device__/__host__ __device__ lambdas are not supported."
" Use a named function object instead.");
};

/**
* @brief Synchronized `+` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_add()
{
return reduction_functor(identity_value<T>{0}, detail::reduce_add_impl<T>{});
};

/**
* @brief Synchronized `min` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_min()
{
return reduction_functor(identity_value{cuda::std::numeric_limits<T>::max()},
detail::reduce_min_impl<T>{});
};

/**
* @brief Synchronized `max` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_max()
{
return reduction_functor(identity_value{cuda::std::numeric_limits<T>::lowest()},
detail::reduce_max_impl<T>{});
};

/**
* @brief Synchronized `count` reduction functor.
*
* @tparam T The value type used for reduction
*/
template <typename T>
auto reduce_count()
{
return reduction_functor(identity_value<T>{0}, detail::reduce_count_impl<T>{});
};

} // namespace cuco
7 changes: 6 additions & 1 deletion tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ include(CTest)
CPMAddPackage(
NAME Catch2
GITHUB_REPOSITORY catchorg/Catch2
VERSION 2.11.1
VERSION 2.13.9
)

if(Catch2_ADDED)
Expand Down Expand Up @@ -86,3 +86,8 @@ ConfigureTest(STATIC_MULTIMAP_TEST
static_multimap/multiplicity_test.cu
static_multimap/non_match_test.cu
static_multimap/pair_function_test.cu)

###################################################################################################
# - static_reduction_map tests --------------------------------------------------------------------
ConfigureTest(STATIC_REDUCTION_MAP_TEST
static_reduction_map/reduction_functors_test.cu)
Loading