From ad8e3f6bf13642c81cebd76cd6f03ac4c1061906 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Mon, 31 Mar 2025 14:21:56 -0700 Subject: [PATCH 01/26] Initial commit of new ops --- include/umpire/op.hpp | 17 + include/umpire/op/cuda.hpp | 469 +++++++++++++++++++++++++ include/umpire/op/dispatch.hpp | 113 ++++++ include/umpire/op/hip.hpp | 499 +++++++++++++++++++++++++++ include/umpire/op/host.hpp | 153 ++++++++ include/umpire/op/operations.hpp | 78 +++++ include/umpire/resource/platform.hpp | 56 +++ src/umpire/CMakeLists.txt | 1 + src/umpire/ResourceManager.cpp | 41 +-- src/umpire/ResourceManager.hpp | 6 + 10 files changed, 1413 insertions(+), 20 deletions(-) create mode 100644 include/umpire/op.hpp create mode 100644 include/umpire/op/cuda.hpp create mode 100644 include/umpire/op/dispatch.hpp create mode 100644 include/umpire/op/hip.hpp create mode 100644 include/umpire/op/host.hpp create mode 100644 include/umpire/op/operations.hpp create mode 100644 include/umpire/resource/platform.hpp diff --git a/include/umpire/op.hpp b/include/umpire/op.hpp new file mode 100644 index 000000000..4130009aa --- /dev/null +++ b/include/umpire/op.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include "umpire/config.hpp" + +#include "umpire/op/operations.hpp" +#include "umpire/op/host.hpp" +#if defined(UMPIRE_ENABLE_CUDA) +#include "umpire/op/cuda.hpp" +#endif +#if defined(UMPIRE_ENABLE_HIP) +#include "umpire/op/hip.hpp" +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) +#include "umpire/op/omp_target.hpp" +#endif + +#include "umpire/op/dispatch.hpp" \ No newline at end of file diff --git a/include/umpire/op/cuda.hpp b/include/umpire/op/cuda.hpp new file mode 100644 index 000000000..f2fa13aff --- /dev/null +++ b/include/umpire/op/cuda.hpp @@ -0,0 +1,469 @@ +#pragma once + +#include "umpire/resource/platform.hpp" +#include "umpire/util/error.hpp" +#include "umpire/util/Platform.hpp" + +// Forward declaration of kernel for launching directly in device code if needed +extern "C" { +__global__ void +umpire_cuda_fill(void* data, int value, std::size_t length); +} + +namespace { + template + struct get_kind; + + template<> + struct get_kind { + static constexpr cudaMemcpyKind value = cudaMemcpyDeviceToHost; + }; + + template<> + struct get_kind { + static constexpr cudaMemcpyKind value = cudaMemcpyHostToDevice; + }; + + template<> + struct get_kind { + static constexpr cudaMemcpyKind value = cudaMemcpyDeviceToDevice; + }; +} + +namespace umpire { +namespace op { + +namespace { + // Helper function to check if a CUDA device supports managed memory features + inline bool check_device_managed_memory(int device) { + cudaDeviceProp properties; + cudaError_t error = ::cudaGetDeviceProperties(&properties, device); + + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaGetDeviceProperties for device {} failed with error: {}", + device, cudaGetErrorString(error))); + } + + return (properties.managedMemory == 1 && properties.concurrentManagedAccess == 1); + } + + // Generic function to handle cudaMemAdvise operations + template + inline void advise_impl(T* ptr, std::size_t n, int device, cudaMemoryAdvise advice) { + std::size_t size = n; + if (std::is_same::value) { + // void pointers don't have a size + } else { + size = sizeof(T) * n; + } + + if (check_device_managed_memory(device)) { + cudaError_t error = ::cudaMemAdvise(ptr, size, advice, device); + + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemAdvise(ptr={}, size={}, advice={}, device={}) failed with error: {}", + ptr, size, static_cast(advice), device, cudaGetErrorString(error))); + } + } + } + + // Generic copy function that handles the different kinds of copies + template + inline void copy_impl(T* src, T* dst, std::size_t len, cudaMemcpyKind kind) { + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + cudaError_t error = ::cudaMemcpy(dst, src, size, kind); + if (error != cudaSuccess) { + UMPIRE_ERROR( + runtime_error, + umpire::fmt::format( + "cudaMemcpy(dst={}, src={}, size={}, kind={}) failed with error: {}", + dst, src, size, static_cast(kind), cudaGetErrorString(error))); + } + } + + // Async version of copy for use with CUDA streams + template + inline camp::resources::Event copy_async_impl(T* src, T* dst, std::size_t len, camp::resources::Resource& r, cudaMemcpyKind kind) { + auto device = r.try_get(); + if (!device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", + platform_to_string(r.get_platform()))); + } + auto stream = device->get_stream(); + + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + cudaError_t error = ::cudaMemcpyAsync(dst, src, size, kind, stream); + + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemcpyAsync(dst={}, src={}, size={}, kind={}, stream={}) failed with error: {}", + dst, src, size, static_cast(kind), + (void*)stream, cudaGetErrorString(error))); + } + + return camp::resources::EventProxy{r}; + } + + // Generic memset implementation + template + inline void memset_impl(T* ptr, int value, std::size_t len) { + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + cudaError_t error = ::cudaMemset(ptr, value, size); + + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemset(ptr={}, value={}, size={}) failed with error: {}", + ptr, value, size, cudaGetErrorString(error))); + } + } + + // Async version of memset + template + inline camp::resources::Event memset_async_impl(T* ptr, int value, std::size_t len, camp::resources::Resource& r) { + auto device = r.try_get(); + if (!device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", + platform_to_string(r.get_platform()))); + } + auto stream = device->get_stream(); + + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + cudaError_t error = ::cudaMemsetAsync(ptr, value, size, stream); + + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format( + "cudaMemsetAsync(ptr={}, value={}, size={}, stream={}) failed with error: {}", + ptr, value, size, (void*)stream, cudaGetErrorString(error))); + } + + return camp::resources::EventProxy{r}; + } +} + +// Copy operations for different platform combinations +template<> +struct copy +{ + template + static void exec(T* src, T* dst, std::size_t len) { + copy_impl(src, dst, len, cudaMemcpyDeviceToDevice); + } + + template + static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + return copy_async_impl(src, dst, len, r, cudaMemcpyDeviceToDevice); + } +}; + +template<> +struct copy +{ + template + static void exec(T* src, T* dst, std::size_t len) { + copy_impl(src, dst, len, cudaMemcpyDeviceToHost); + } + + template + static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + return copy_async_impl(src, dst, len, r, cudaMemcpyDeviceToHost); + } +}; + +template<> +struct copy +{ + template + static void exec(T* src, T* dst, std::size_t len) { + copy_impl(src, dst, len, cudaMemcpyHostToDevice); + } + + template + static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + return copy_async_impl(src, dst, len, r, cudaMemcpyHostToDevice); + } +}; + +// Special handling for void pointers +template<> +template<> +inline void copy::exec(void* src, void* dst, std::size_t len) { + copy_impl(src, dst, len, cudaMemcpyDeviceToDevice); +} + +template<> +template<> +inline void copy::exec(void* src, void* dst, std::size_t len) { + copy_impl(src, dst, len, cudaMemcpyDeviceToHost); +} + +template<> +template<> +inline void copy::exec(void* src, void* dst, std::size_t len) { + copy_impl(src, dst, len, cudaMemcpyHostToDevice); +} + +// Memset operations +template<> +struct memset +{ + template + static void exec(T* src, int val, std::size_t len) { + memset_impl(src, val, len); + } + + template + static camp::resources::Event exec(T* src, int val, std::size_t len, camp::resources::Resource& r) { + return memset_async_impl(src, val, len, r); + } + + // Specialization for void* + template<> + static void exec(void* src, int val, std::size_t len) { + memset_impl(src, val, len); + } + + template<> + static camp::resources::Event exec(void* src, int val, std::size_t len, camp::resources::Resource& r) { + return memset_async_impl(src, val, len, r); + } +}; + +// Reallocate implementation +template<> +struct reallocate +{ + template + static T* exec(T* src, std::size_t size) { + if (!src) { + // This should allocate memory, but we can't do that directly here + // since we don't have access to the allocator + return nullptr; + } + + if (size == 0) { + // Should deallocate src and return nullptr + return nullptr; + } + + // This should be handled by the ResourceManager which has access to: + // 1. The AllocationRecord to get the original size + // 2. The Allocator to allocate new memory + + // For now, just return nullptr to indicate this needs to be + // handled at a higher level + return nullptr; + } +}; + +// Memory advice operations +template<> +struct accessed_by +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseSetAccessedBy); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseSetAccessedBy); + } +}; + +template<> +struct preferred_location +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseSetPreferredLocation); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseSetPreferredLocation); + } +}; + +template<> +struct read_mostly +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseSetReadMostly); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseSetReadMostly); + } +}; + +template<> +struct unset_accessed_by +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseUnsetAccessedBy); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseUnsetAccessedBy); + } +}; + +template<> +struct unset_preferred_location +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseUnsetPreferredLocation); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseUnsetPreferredLocation); + } +}; + +template<> +struct unset_read_mostly +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseUnsetReadMostly); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, cudaMemAdviseUnsetReadMostly); + } +}; + +// Prefetch operations +template<> +struct prefetch +{ + template + static void exec(T* src, int device, std::size_t len) { + // Use current device for properties if device is CPU + int current_device; + cudaGetDevice(¤t_device); + int gpu = (device != cudaCpuDeviceId) ? device : current_device; + + if (check_device_managed_memory(gpu)) { + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + cudaError_t error = ::cudaMemPrefetchAsync(src, size, device, nullptr); + + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}) failed with error: {}", + src, size, device, cudaGetErrorString(error))); + } + } + } + + template + static camp::resources::Event exec(T* src, int device, std::size_t len, camp::resources::Resource& r) { + auto cuda_device = r.try_get(); + if (!cuda_device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", + platform_to_string(r.get_platform()))); + } + auto stream = cuda_device->get_stream(); + + // Use current device for properties if device is CPU + int current_device; + cudaGetDevice(¤t_device); + int gpu = (device != cudaCpuDeviceId) ? device : current_device; + + if (check_device_managed_memory(gpu)) { + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + cudaError_t error = ::cudaMemPrefetchAsync(src, size, device, stream); + + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}, stream={}) failed with error: {}", + src, size, device, (void*)stream, cudaGetErrorString(error))); + } + } + + return camp::resources::EventProxy{r}; + } + + // Specializations for void* + template<> + static void exec(void* src, int device, std::size_t len) { + // Use current device for properties if device is CPU + int current_device; + cudaGetDevice(¤t_device); + int gpu = (device != cudaCpuDeviceId) ? device : current_device; + + if (check_device_managed_memory(gpu)) { + cudaError_t error = ::cudaMemPrefetchAsync(src, len, device, nullptr); + + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}) failed with error: {}", + src, len, device, cudaGetErrorString(error))); + } + } + } + + template<> + static camp::resources::Event exec(void* src, int device, std::size_t len, camp::resources::Resource& r) { + auto cuda_device = r.try_get(); + if (!cuda_device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", + platform_to_string(r.get_platform()))); + } + auto stream = cuda_device->get_stream(); + + // Use current device for properties if device is CPU + int current_device; + cudaGetDevice(¤t_device); + int gpu = (device != cudaCpuDeviceId) ? device : current_device; + + if (check_device_managed_memory(gpu)) { + cudaError_t error = ::cudaMemPrefetchAsync(src, len, device, stream); + + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}, stream={}) failed with error: {}", + src, len, device, (void*)stream, cudaGetErrorString(error))); + } + } + + return camp::resources::EventProxy{r}; + } +}; + +} +} \ No newline at end of file diff --git a/include/umpire/op/dispatch.hpp b/include/umpire/op/dispatch.hpp new file mode 100644 index 000000000..ce26103ac --- /dev/null +++ b/include/umpire/op/dispatch.hpp @@ -0,0 +1,113 @@ +#pragma once + +#include "umpire/config.hpp" + +#include "umpire/resource/platform.hpp" + +#include "umpire/ResourceManager.hpp" + +namespace umpire { +namespace op { + + +template class Op> struct op_caller{}; + +template class Op> +struct op_caller<1, Op > { + template + inline static void exec(T* src, Args... args) { + auto& allocation_map = ResourceManager::getInstance().m_allocations; + auto src_record = allocation_map.find(src); + auto p = src_record->strategy->getPlatform(); + + // get src platform + if (p == camp::resources::Platform::host) { + Op::exec(src, args...); + } +#if defined(UMPIRE_ENABLE_CUDA) + else if (p == camp::resources::Platform::cuda) { + Op::exec(src, args...); + } +#endif +#if defined(UMPIRE_ENABLE_HIP) + else if (p == camp::resources::Platform::hip) { + Op::exec(src, args...); + } +#endif + } +}; + +template +struct count { + static constexpr std::size_t value = sizeof...(Ts); +}; + + +template class Op> +struct op_caller<2, Op> { + // try calling with Op::arity + template + inline static void exec(T* src, T* dst, Args... args) { + auto& allocation_map = ResourceManager::getInstance().m_allocations; + auto src_record = allocation_map.find(src); + auto dst_record = allocation_map.find(dst); + + auto p1 = src_record->strategy->getPlatform(); + auto p2 = dst_record->strategy->getPlatform(); + + // get src and dest platform + if ((p1 == p2) && (p1 == camp::resources::Platform::host)) { + return Op::exec(src, dst, args...); + } +#if defined(UMPIRE_ENABLE_CUDA) + if (p1 == p2 && (p1 == camp::resources::Platform::cuda)) { + Op::exec(src, dst, args...); + } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::cuda) { + Op::exec(src, dst, args...); + } else if (p1 == camp::resources::Platform::cuda && p2 == camp::resources::Platform::host) { + Op::exec(src, dst, args...); + } +#endif +#if defined(UMPIRE_ENABLE_HIP) + if (p1 == p2 && (p1 == camp::resources::Platform::hip)) { + Op::exec(src, dst, args...); + } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::hip) { + Op::exec(src, dst, args...); + } else if (p1 == camp::resources::Platform::hip && p2 == camp::resources::Platform::host) { + Op::exec(src, dst, args...); + } +#endif + } +}; + +} + +// template +// void copy(T* src, T* dst, std::size_t len) { +// op::copy::exec(src, dst, len); +// } + +template +void copy(T* src, T* dst, std::size_t len) { + op::op_caller<2, op::copy>::exec(src, dst, len); +} + +//template +//void copy(T* src, T* dst, std::size_t len, ) { +// op::op_caller<2, op::copy>::exec(src, dst, len); +//} + + + +// template +// void memset(T* a, T v, std::size_t len) { +// op::memset::exec(a, v, len); +// } +// +template +void memset(T* src, V v, std::size_t len) { + + op::op_caller<1, op::memset>::exec(src, v, len); +} + +} \ No newline at end of file diff --git a/include/umpire/op/hip.hpp b/include/umpire/op/hip.hpp new file mode 100644 index 000000000..7d2d13a0d --- /dev/null +++ b/include/umpire/op/hip.hpp @@ -0,0 +1,499 @@ +#pragma once + +#include "umpire/resource/platform.hpp" +#include "umpire/util/error.hpp" +#include "umpire/util/Platform.hpp" + +// Forward declaration of kernel for launching directly in device code if needed +extern "C" { +__global__ void +umpire_hip_fill(void* data, int value, std::size_t length); +} + +namespace { + template + struct get_kind; + + template<> + struct get_kind { + static constexpr hipMemcpyKind value = hipMemcpyDeviceToHost; + }; + + template<> + struct get_kind { + static constexpr hipMemcpyKind value = hipMemcpyHostToDevice; + }; + + template<> + struct get_kind { + static constexpr hipMemcpyKind value = hipMemcpyDeviceToDevice; + }; +} + +namespace umpire { +namespace op { + +namespace { + // Helper function to check if a HIP device supports managed memory features + inline bool check_device_managed_memory(int device) { + hipDeviceProp_t properties; + hipError_t error = ::hipGetDeviceProperties(&properties, device); + + if (error != hipSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("hipGetDeviceProperties for device {} failed with error: {}", + device, hipGetErrorString(error))); + } + + return (properties.managedMemory == 1 && properties.concurrentManagedAccess == 1); + } + + // Generic function to handle hipMemAdvise operations + template + inline void advise_impl(T* ptr, std::size_t n, int device, hipMemoryAdvise advice) { + std::size_t size = n; + if (std::is_same::value) { + // void pointers don't have a size + } else { + size = sizeof(T) * n; + } + + if (check_device_managed_memory(device)) { + hipError_t error = ::hipMemAdvise(ptr, size, advice, device); + + if (error != hipSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("hipMemAdvise(ptr={}, size={}, advice={}, device={}) failed with error: {}", + ptr, size, static_cast(advice), device, hipGetErrorString(error))); + } + } + } + + // Generic copy function that handles the different kinds of copies + template + inline void copy_impl(T* src, T* dst, std::size_t len, hipMemcpyKind kind) { + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + hipError_t error = ::hipMemcpy(dst, src, size, kind); + if (error != hipSuccess) { + UMPIRE_ERROR( + runtime_error, + umpire::fmt::format( + "hipMemcpy(dst={}, src={}, size={}, kind={}) failed with error: {}", + dst, src, size, static_cast(kind), hipGetErrorString(error))); + } + } + + // Async version of copy for use with HIP streams + template + inline camp::resources::Event copy_async_impl(T* src, T* dst, std::size_t len, camp::resources::Resource& r, hipMemcpyKind kind) { + auto device = r.try_get(); + if (!device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Hip, got resources::{}", + platform_to_string(r.get_platform()))); + } + auto stream = device->get_stream(); + + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + hipError_t error = ::hipMemcpyAsync(dst, src, size, kind, stream); + + if (error != hipSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("hipMemcpyAsync(dst={}, src={}, size={}, kind={}, stream={}) failed with error: {}", + dst, src, size, static_cast(kind), + (void*)stream, hipGetErrorString(error))); + } + + return camp::resources::EventProxy{r}; + } + + // Generic memset implementation + template + inline void memset_impl(T* ptr, int value, std::size_t len) { + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + hipError_t error = ::hipMemset(ptr, value, size); + + if (error != hipSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("hipMemset(ptr={}, value={}, size={}) failed with error: {}", + ptr, value, size, hipGetErrorString(error))); + } + } + + // Async version of memset + template + inline camp::resources::Event memset_async_impl(T* ptr, int value, std::size_t len, camp::resources::Resource& r) { + auto device = r.try_get(); + if (!device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Hip, got resources::{}", + platform_to_string(r.get_platform()))); + } + auto stream = device->get_stream(); + + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + hipError_t error = ::hipMemsetAsync(ptr, value, size, stream); + + if (error != hipSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format( + "hipMemsetAsync(ptr={}, value={}, size={}, stream={}) failed with error: {}", + ptr, value, size, (void*)stream, hipGetErrorString(error))); + } + + return camp::resources::EventProxy{r}; + } +} + +// Copy operations for different platform combinations +template<> +struct copy +{ + template + static void exec(T* src, T* dst, std::size_t len) { + copy_impl(src, dst, len, hipMemcpyDeviceToDevice); + } + + template + static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + return copy_async_impl(src, dst, len, r, hipMemcpyDeviceToDevice); + } +}; + +template<> +struct copy +{ + template + static void exec(T* src, T* dst, std::size_t len) { + copy_impl(src, dst, len, hipMemcpyDeviceToHost); + } + + template + static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + return copy_async_impl(src, dst, len, r, hipMemcpyDeviceToHost); + } +}; + +template<> +struct copy +{ + template + static void exec(T* src, T* dst, std::size_t len) { + copy_impl(src, dst, len, hipMemcpyHostToDevice); + } + + template + static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + return copy_async_impl(src, dst, len, r, hipMemcpyHostToDevice); + } +}; + +// Special handling for void pointers +template<> +template<> +inline void copy::exec(void* src, void* dst, std::size_t len) { + copy_impl(src, dst, len, hipMemcpyDeviceToDevice); +} + +template<> +template<> +inline void copy::exec(void* src, void* dst, std::size_t len) { + copy_impl(src, dst, len, hipMemcpyDeviceToHost); +} + +template<> +template<> +inline void copy::exec(void* src, void* dst, std::size_t len) { + copy_impl(src, dst, len, hipMemcpyHostToDevice); +} + +// Memset operations +template<> +struct memset +{ + template + static void exec(T* src, int val, std::size_t len) { + memset_impl(src, val, len); + } + + template + static camp::resources::Event exec(T* src, int val, std::size_t len, camp::resources::Resource& r) { + return memset_async_impl(src, val, len, r); + } + + // Specialization for void* + template<> + static void exec(void* src, int val, std::size_t len) { + memset_impl(src, val, len); + } + + template<> + static camp::resources::Event exec(void* src, int val, std::size_t len, camp::resources::Resource& r) { + return memset_async_impl(src, val, len, r); + } +}; + +// Reallocate implementation +template<> +struct reallocate +{ + template + static T* exec(T* src, std::size_t size) { + if (!src) { + // This should allocate memory, but we can't do that directly here + // since we don't have access to the allocator + return nullptr; + } + + if (size == 0) { + // Should deallocate src and return nullptr + return nullptr; + } + + // This should be handled by the ResourceManager which has access to: + // 1. The AllocationRecord to get the original size + // 2. The Allocator to allocate new memory + + // For now, just return nullptr to indicate this needs to be + // handled at a higher level + return nullptr; + } +}; + +// Memory advice operations +template<> +struct accessed_by +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseSetAccessedBy); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseSetAccessedBy); + } +}; + +template<> +struct preferred_location +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseSetPreferredLocation); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseSetPreferredLocation); + } +}; + +template<> +struct read_mostly +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseSetReadMostly); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseSetReadMostly); + } +}; + +template<> +struct unset_accessed_by +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseUnsetAccessedBy); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseUnsetAccessedBy); + } +}; + +template<> +struct unset_preferred_location +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseUnsetPreferredLocation); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseUnsetPreferredLocation); + } +}; + +template<> +struct unset_read_mostly +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseUnsetReadMostly); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseUnsetReadMostly); + } +}; + +#if HIP_VERSION_MAJOR >= 5 +template<> +struct coarse_grain +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseSetCoarseGrain); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseSetCoarseGrain); + } +}; + +template<> +struct unset_coarse_grain +{ + template + static inline void exec(T* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseUnsetCoarseGrain); + } + + template<> + static inline void exec(void* src, int device, std::size_t len) { + advise_impl(src, len, device, hipMemAdviseUnsetCoarseGrain); + } +}; +#endif + +// Prefetch operations +template<> +struct prefetch +{ + template + static void exec(T* src, int device, std::size_t len) { + // Use current device for properties if device is CPU + int current_device; + hipGetDevice(¤t_device); + int gpu = (device != hipCpuDeviceId) ? device : current_device; + + if (check_device_managed_memory(gpu)) { + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + hipError_t error = ::hipMemPrefetchAsync(src, size, device, nullptr); + + if (error != hipSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("hipMemPrefetchAsync(ptr={}, size={}, device={}) failed with error: {}", + src, size, device, hipGetErrorString(error))); + } + } + } + + template + static camp::resources::Event exec(T* src, int device, std::size_t len, camp::resources::Resource& r) { + auto hip_device = r.try_get(); + if (!hip_device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Hip, got resources::{}", + platform_to_string(r.get_platform()))); + } + auto stream = hip_device->get_stream(); + + // Use current device for properties if device is CPU + int current_device; + hipGetDevice(¤t_device); + int gpu = (device != hipCpuDeviceId) ? device : current_device; + + if (check_device_managed_memory(gpu)) { + std::size_t size = len; + if (!std::is_same::value) { + size = sizeof(T) * len; + } + + hipError_t error = ::hipMemPrefetchAsync(src, size, device, stream); + + if (error != hipSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("hipMemPrefetchAsync(ptr={}, size={}, device={}, stream={}) failed with error: {}", + src, size, device, (void*)stream, hipGetErrorString(error))); + } + } + + return camp::resources::EventProxy{r}; + } + + // Specializations for void* + template<> + static void exec(void* src, int device, std::size_t len) { + // Use current device for properties if device is CPU + int current_device; + hipGetDevice(¤t_device); + int gpu = (device != hipCpuDeviceId) ? device : current_device; + + if (check_device_managed_memory(gpu)) { + hipError_t error = ::hipMemPrefetchAsync(src, len, device, nullptr); + + if (error != hipSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("hipMemPrefetchAsync(ptr={}, size={}, device={}) failed with error: {}", + src, len, device, hipGetErrorString(error))); + } + } + } + + template<> + static camp::resources::Event exec(void* src, int device, std::size_t len, camp::resources::Resource& r) { + auto hip_device = r.try_get(); + if (!hip_device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Hip, got resources::{}", + platform_to_string(r.get_platform()))); + } + auto stream = hip_device->get_stream(); + + // Use current device for properties if device is CPU + int current_device; + hipGetDevice(¤t_device); + int gpu = (device != hipCpuDeviceId) ? device : current_device; + + if (check_device_managed_memory(gpu)) { + hipError_t error = ::hipMemPrefetchAsync(src, len, device, stream); + + if (error != hipSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("hipMemPrefetchAsync(ptr={}, size={}, device={}, stream={}) failed with error: {}", + src, len, device, (void*)stream, hipGetErrorString(error))); + } + } + + return camp::resources::EventProxy{r}; + } +}; + +} +} \ No newline at end of file diff --git a/include/umpire/op/host.hpp b/include/umpire/op/host.hpp new file mode 100644 index 000000000..8531bf3e0 --- /dev/null +++ b/include/umpire/op/host.hpp @@ -0,0 +1,153 @@ +#pragma once + +#include "umpire/resource/platform.hpp" +#include "umpire/util/error.hpp" + +#include + +namespace umpire { +namespace op { + +namespace { + // Generic implementation of host copy + template + inline void copy_impl(T* src, T* dst, std::size_t len) { + std::memcpy(dst, src, len * sizeof(T)); + } + + // Specialization for void* + template<> + inline void copy_impl(void* src, void* dst, std::size_t len) { + std::memcpy(dst, src, len); + } + + // Generic implementation of host memset + template + inline void memset_impl(T* src, int val, std::size_t len) { + std::memset(src, val, sizeof(T) * len); + } + + // Specialization for void* + template<> + inline void memset_impl(void* src, int val, std::size_t len) { + std::memset(src, val, len); + } +} + +// Host-to-host copy operation +template<> +struct copy +{ + template + static void exec(T* src, T* dst, std::size_t len) { + copy_impl(src, dst, len); + } + + // Async version returns a dummy event + template + static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + copy_impl(src, dst, len); + return camp::resources::EventProxy{r}; + } + + // Specialization for void* + template<> + static void exec(void* src, void* dst, std::size_t len) { + copy_impl(src, dst, len); + } + + template<> + static camp::resources::Event exec(void* src, void* dst, std::size_t len, camp::resources::Resource& r) { + copy_impl(src, dst, len); + return camp::resources::EventProxy{r}; + } +}; + +// Host memset operation +template<> +struct memset +{ + template + static void exec(T* src, int val, std::size_t len) { + memset_impl(src, val, len); + } + + // Async version returns a dummy event + template + static camp::resources::Event exec(T* src, int val, std::size_t len, camp::resources::Resource& r) { + memset_impl(src, val, len); + return camp::resources::EventProxy{r}; + } + + // Specialization for void* + template<> + static void exec(void* src, int val, std::size_t len) { + memset_impl(src, val, len); + } + + template<> + static camp::resources::Event exec(void* src, int val, std::size_t len, camp::resources::Resource& r) { + memset_impl(src, val, len); + return camp::resources::EventProxy{r}; + } +}; + +// Host reallocate operation - uses system realloc +template<> +struct reallocate +{ + template + static T* exec(T* src, std::size_t size) { + if (!src) { + // Return nullptr for nullptr input + return nullptr; + } + + if (size == 0) { + if (src) { + // Free memory for zero-sized allocation + std::free(src); + } + return nullptr; + } + + // Use standard realloc for host memory + T* ret = static_cast(std::realloc(src, size * sizeof(T))); + + if (!ret && size > 0) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("Host realloc failed for pointer={}, size={}", + src, size * sizeof(T))); + } + + return ret; + } + + // Specialization for void* to handle size correctly + template<> + static void* exec(void* src, std::size_t size) { + if (!src) { + return nullptr; + } + + if (size == 0) { + if (src) { + std::free(src); + } + return nullptr; + } + + void* ret = std::realloc(src, size); + + if (!ret && size > 0) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("Host realloc failed for pointer={}, size={}", + src, size)); + } + + return ret; + } +}; + +} +} \ No newline at end of file diff --git a/include/umpire/op/operations.hpp b/include/umpire/op/operations.hpp new file mode 100644 index 000000000..e10b16b8f --- /dev/null +++ b/include/umpire/op/operations.hpp @@ -0,0 +1,78 @@ +#pragma once + +namespace umpire { +namespace op { + +struct operation { + static constexpr int arity = -1; +}; + +template +struct copy : public operation { + static constexpr int arity = 2; +}; + +template +struct memset : public operation { + static constexpr int arity = 1; +}; + +template +struct reallocate : public operation { + static constexpr int arity = 1; +}; + +template +struct advise : public operation { + static constexpr int arity = 1; +}; + +template +struct accessed_by : public operation { + static constexpr int arity = 1; +}; + +template +struct preferred_location : public operation { + static constexpr int arity = 1; +}; + +template +struct read_mostly : public operation { + static constexpr int arity = 1; +}; + +template +struct unset_accessed_by : public operation { + static constexpr int arity = 1; +}; + +template +struct unset_preferred_location : public operation { + static constexpr int arity = 1; +}; + +template +struct unset_read_mostly : public operation { + static constexpr int arity = 1; +}; + +#if (defined(UMPIRE_ENABLE_HIP) && HIP_VERSION_MAJOR >= 5) || defined(UMPIRE_ENABLE_CUDA) +template +struct coarse_grain : public operation { + static constexpr int arity = 1; +}; + +template +struct unset_coarse_grain : public operation { + static constexpr int arity = 1; +}; +#endif + +template +struct prefetch : public operation { + static constexpr int arity = 1; +}; + +} +} \ No newline at end of file diff --git a/include/umpire/resource/platform.hpp b/include/umpire/resource/platform.hpp new file mode 100644 index 000000000..bfe391358 --- /dev/null +++ b/include/umpire/resource/platform.hpp @@ -0,0 +1,56 @@ +#pragma once + +#include "umpire/config.hpp" + +#include "camp/resource/platform.hpp" + +namespace umpire { +namespace resource { + +template +struct platform_for {}; + +struct undefined_platform {}; +struct host_platform {}; +#if defined(UMPIRE_ENABLE_CUDA) +struct cuda_platform {}; +#endif +#if defined(UMPIRE_ENABLE_HIP) +struct hip_platform {}; +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) +struct omp_target_platform {}; +#endif + +template<> +struct platform_for { + static constexpr camp::resources::Platform value = camp::resources::Platform::undefined; +}; + +template<> +struct platform_for { + static constexpr camp::resources::Platform value = camp::resources::Platform::host; +}; + +#if defined(UMPIRE_ENABLE_CUDA) +template<> +struct platform_for { + static constexpr camp::resources::Platform camp::resources::Platform::cuda; +} +#endif +#if defined(UMPIRE_ENABLE_HIP) +template<> +struct platform_for { + static constexpr camp::resources::Platform camp::resources::Platform::hip; +} +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) +template<> +struct platform_for { + static constexpr camp::resources::Platform camp::resources::Platform::omp_target; +} +#endif + + +} +} \ No newline at end of file diff --git a/src/umpire/CMakeLists.txt b/src/umpire/CMakeLists.txt index 39b181f2a..a17ce9063 100644 --- a/src/umpire/CMakeLists.txt +++ b/src/umpire/CMakeLists.txt @@ -190,6 +190,7 @@ target_include_directories( umpire PUBLIC $ + $ $ $) diff --git a/src/umpire/ResourceManager.cpp b/src/umpire/ResourceManager.cpp index 76c89a24f..3f0c5d551 100644 --- a/src/umpire/ResourceManager.cpp +++ b/src/umpire/ResourceManager.cpp @@ -40,6 +40,8 @@ #include "umpire/util/sycl_compat.hpp" #endif +#include "umpire/op.hpp" + static const char* s_null_resource_name{"__umpire_internal_null"}; static const char* s_zero_byte_pool_name{"__umpire_internal_0_byte_pool"}; @@ -390,7 +392,7 @@ void ResourceManager::copy(void* dst_ptr, void* src_ptr, std::size_t size) { UMPIRE_LOG(Debug, "(src_ptr=" << src_ptr << ", dst_ptr=" << dst_ptr << ", size=" << size << ")"); - auto& op_registry = op::MemoryOperationRegistry::getInstance(); + //auto& op_registry = op::MemoryOperationRegistry::getInstance(); auto src_alloc_record = m_allocations.find(src_ptr); std::ptrdiff_t src_offset = static_cast(src_ptr) - static_cast(src_alloc_record->ptr); @@ -404,29 +406,27 @@ void ResourceManager::copy(void* dst_ptr, void* src_ptr, std::size_t size) size = src_size; } - umpire::event::record([&](auto& event) { - event.name("copy") - .category(event::category::operation) - .arg("src", src_ptr) - .arg("dst", dst_ptr) - .arg("src_offset", src_offset) - .arg("dst_offset", dst_offset) - .arg("size", size) - .arg("src_allocator_ref", (void*)src_alloc_record->strategy) - .arg("dst_allocator_ref", (void*)dst_alloc_record->strategy) - .tag("src_allocator_name", src_alloc_record->strategy->getName()) - .tag("dst_allocator_name", dst_alloc_record->strategy->getName()) - .tag("replay", "true"); - }); + //umpire::event::record([&](auto& event) { + // event.name("copy") + // .category(event::category::operation) + // .arg("src", src_ptr) + // .arg("dst", dst_ptr) + // .arg("src_offset", src_offset) + // .arg("dst_offset", dst_offset) + // .arg("size", size) + // .arg("src_allocator_ref", (void*)src_alloc_record->strategy) + // .arg("dst_allocator_ref", (void*)dst_alloc_record->strategy) + // .tag("src_allocator_name", src_alloc_record->strategy->getName()) + // .tag("dst_allocator_name", dst_alloc_record->strategy->getName()) + // .tag("replay", "true"); + //}); if (size > dst_size) { UMPIRE_ERROR(runtime_error, fmt::format("Not enough space in destination to copy {} bytes into {} bytes", size, dst_size)); } - auto op = op_registry.find("COPY", src_alloc_record->strategy, dst_alloc_record->strategy); - - op->transform(src_ptr, &dst_ptr, src_alloc_record, dst_alloc_record, size); + umpire::copy(src_ptr, dst_ptr, size); } camp::resources::EventProxy ResourceManager::copy(void* dst_ptr, void* src_ptr, @@ -504,9 +504,10 @@ void ResourceManager::memset(void* ptr, int value, std::size_t length) UMPIRE_ERROR(runtime_error, fmt::format("Cannot memset over the end of allocation: {} -> {}", length, size)); } - auto op = op_registry.find("MEMSET", alloc_record->strategy, alloc_record->strategy); + //auto op = op_registry.find("MEMSET", alloc_record->strategy, alloc_record->strategy); - op->apply(ptr, alloc_record, value, length); + //op->apply(ptr, alloc_record, value, length); + umpire::memset(ptr, value, length); } camp::resources::EventProxy ResourceManager::memset(void* ptr, int value, diff --git a/src/umpire/ResourceManager.hpp b/src/umpire/ResourceManager.hpp index 71a93b32c..ae37bb7ce 100644 --- a/src/umpire/ResourceManager.hpp +++ b/src/umpire/ResourceManager.hpp @@ -25,6 +25,9 @@ namespace umpire { namespace op { class MemoryOperation; + +template class Op> struct op_caller; + } namespace strategy { @@ -359,6 +362,9 @@ class ResourceManager { friend std::vector get_allocator_records(Allocator); friend strategy::ZeroByteHandler; friend strategy::mixins::AllocateNull; + + template class Op> + friend struct umpire::op::op_caller; }; } // end namespace umpire From 07bbc3d560b3e0921423c501549b6e6e743b84cf Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Mon, 31 Mar 2025 14:39:05 -0700 Subject: [PATCH 02/26] Update RM to use more operations --- include/umpire/op/cuda.hpp | 18 ++--- include/umpire/op/dispatch.hpp | 121 +++++++++++++++++++++++++--- include/umpire/op/hip.hpp | 2 +- include/umpire/op/host.hpp | 8 +- src/umpire/ResourceManager.cpp | 141 +++++++++++++++++++++++---------- 5 files changed, 224 insertions(+), 66 deletions(-) diff --git a/include/umpire/op/cuda.hpp b/include/umpire/op/cuda.hpp index f2fa13aff..8c5f5d6f4 100644 --- a/include/umpire/op/cuda.hpp +++ b/include/umpire/op/cuda.hpp @@ -89,7 +89,7 @@ namespace { // Async version of copy for use with CUDA streams template - inline camp::resources::Event copy_async_impl(T* src, T* dst, std::size_t len, camp::resources::Resource& r, cudaMemcpyKind kind) { + inline camp::resources::EventProxy copy_async_impl(T* src, T* dst, std::size_t len, camp::resources::Resource& r, cudaMemcpyKind kind) { auto device = r.try_get(); if (!device) { UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", @@ -133,7 +133,7 @@ namespace { // Async version of memset template - inline camp::resources::Event memset_async_impl(T* ptr, int value, std::size_t len, camp::resources::Resource& r) { + inline camp::resources::EventProxy memset_async_impl(T* ptr, int value, std::size_t len, camp::resources::Resource& r) { auto device = r.try_get(); if (!device) { UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", @@ -169,7 +169,7 @@ struct copy } template - static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { return copy_async_impl(src, dst, len, r, cudaMemcpyDeviceToDevice); } }; @@ -183,7 +183,7 @@ struct copy } template - static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { return copy_async_impl(src, dst, len, r, cudaMemcpyDeviceToHost); } }; @@ -197,7 +197,7 @@ struct copy } template - static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { return copy_async_impl(src, dst, len, r, cudaMemcpyHostToDevice); } }; @@ -231,7 +231,7 @@ struct memset } template - static camp::resources::Event exec(T* src, int val, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(T* src, int val, std::size_t len, camp::resources::Resource& r) { return memset_async_impl(src, val, len, r); } @@ -242,7 +242,7 @@ struct memset } template<> - static camp::resources::Event exec(void* src, int val, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(void* src, int val, std::size_t len, camp::resources::Resource& r) { return memset_async_impl(src, val, len, r); } }; @@ -387,7 +387,7 @@ struct prefetch } template - static camp::resources::Event exec(T* src, int device, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(T* src, int device, std::size_t len, camp::resources::Resource& r) { auto cuda_device = r.try_get(); if (!cuda_device) { UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", @@ -438,7 +438,7 @@ struct prefetch } template<> - static camp::resources::Event exec(void* src, int device, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(void* src, int device, std::size_t len, camp::resources::Resource& r) { auto cuda_device = r.try_get(); if (!cuda_device) { UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", diff --git a/include/umpire/op/dispatch.hpp b/include/umpire/op/dispatch.hpp index ce26103ac..082ec75b2 100644 --- a/include/umpire/op/dispatch.hpp +++ b/include/umpire/op/dispatch.hpp @@ -92,22 +92,121 @@ void copy(T* src, T* dst, std::size_t len) { op::op_caller<2, op::copy>::exec(src, dst, len); } -//template -//void copy(T* src, T* dst, std::size_t len, ) { -// op::op_caller<2, op::copy>::exec(src, dst, len); -//} - +template +camp::resources::EventProxy copy(T* src, T* dst, camp::resources::Resource& ctx, std::size_t len) { + auto& allocation_map = ResourceManager::getInstance().m_allocations; + auto src_record = allocation_map.find(src); + auto dst_record = allocation_map.find(dst); + auto p1 = src_record->strategy->getPlatform(); + auto p2 = dst_record->strategy->getPlatform(); + + // get src and dest platform + if ((p1 == p2) && (p1 == camp::resources::Platform::host)) { + return op::copy::exec(src, dst, len, ctx); + } +#if defined(UMPIRE_ENABLE_CUDA) + if (p1 == p2 && (p1 == camp::resources::Platform::cuda)) { + return op::copy::exec(src, dst, len, ctx); + } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::cuda) { + return op::copy::exec(src, dst, len, ctx); + } else if (p1 == camp::resources::Platform::cuda && p2 == camp::resources::Platform::host) { + return op::copy::exec(src, dst, len, ctx); + } +#endif +#if defined(UMPIRE_ENABLE_HIP) + if (p1 == p2 && (p1 == camp::resources::Platform::hip)) { + return op::copy::exec(src, dst, len, ctx); + } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::hip) { + return op::copy::exec(src, dst, len, ctx); + } else if (p1 == camp::resources::Platform::hip && p2 == camp::resources::Platform::host) { + return op::copy::exec(src, dst, len, ctx); + } +#endif + + UMPIRE_ERROR(runtime_error, + fmt::format("Unknown platforms for copy: src={}, dst={}", + static_cast(p1), static_cast(p2))); + + // Unreachable, but needed to satisfy compiler + return camp::resources::EventProxy{ctx}; +} -// template -// void memset(T* a, T v, std::size_t len) { -// op::memset::exec(a, v, len); -// } -// template void memset(T* src, V v, std::size_t len) { - op::op_caller<1, op::memset>::exec(src, v, len); } +template +camp::resources::EventProxy memset(T* src, int v, camp::resources::Resource& ctx, std::size_t len) { + auto& allocation_map = ResourceManager::getInstance().m_allocations; + auto src_record = allocation_map.find(src); + auto p = src_record->strategy->getPlatform(); + + if (p == camp::resources::Platform::host) { + return op::memset::exec(src, v, len, ctx); + } +#if defined(UMPIRE_ENABLE_CUDA) + else if (p == camp::resources::Platform::cuda) { + return op::memset::exec(src, v, len, ctx); + } +#endif +#if defined(UMPIRE_ENABLE_HIP) + else if (p == camp::resources::Platform::hip) { + return op::memset::exec(src, v, len, ctx); + } +#endif + + UMPIRE_ERROR(runtime_error, + fmt::format("Unknown platform for memset: platform={}", + static_cast(p))); + + // Unreachable, but needed to satisfy compiler + return camp::resources::EventProxy{ctx}; +} + +template +T* reallocate(T* src, std::size_t size) { + // Template-based reallocate is a placeholder for now + // Need the ResourceManager to handle allocations/deallocations + // and allocation record tracking + return nullptr; +} + +template +camp::resources::EventProxy reallocate(T* src, std::size_t size, camp::resources::Resource& ctx) { + // Template-based reallocate is a placeholder for now + // Need the ResourceManager to handle allocations/deallocations + // and allocation record tracking with async support + + // Placeholder return to satisfy compiler + return camp::resources::EventProxy{ctx}; +} + +template +camp::resources::EventProxy prefetch(T* ptr, int device, camp::resources::Resource& ctx, std::size_t size) { + auto& allocation_map = ResourceManager::getInstance().m_allocations; + auto ptr_record = allocation_map.find(ptr); + auto p = ptr_record->strategy->getPlatform(); + + // Currently only CUDA and HIP platforms support prefetch +#if defined(UMPIRE_ENABLE_CUDA) + if (p == camp::resources::Platform::cuda) { + return op::prefetch::exec(ptr, device, size, ctx); + } +#endif +#if defined(UMPIRE_ENABLE_HIP) + if (p == camp::resources::Platform::hip) { + return op::prefetch::exec(ptr, device, size, ctx); + } +#endif + + UMPIRE_ERROR(runtime_error, + fmt::format("Prefetch not supported for platform: {}", + static_cast(p))); + + // Unreachable, but needed to satisfy compiler + return camp::resources::EventProxy{ctx}; +} + } \ No newline at end of file diff --git a/include/umpire/op/hip.hpp b/include/umpire/op/hip.hpp index 7d2d13a0d..2706d61e6 100644 --- a/include/umpire/op/hip.hpp +++ b/include/umpire/op/hip.hpp @@ -89,7 +89,7 @@ namespace { // Async version of copy for use with HIP streams template - inline camp::resources::Event copy_async_impl(T* src, T* dst, std::size_t len, camp::resources::Resource& r, hipMemcpyKind kind) { + inline camp::resources::EventProxy copy_async_impl(T* src, T* dst, std::size_t len, camp::resources::Resource& r, hipMemcpyKind kind) { auto device = r.try_get(); if (!device) { UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Hip, got resources::{}", diff --git a/include/umpire/op/host.hpp b/include/umpire/op/host.hpp index 8531bf3e0..12169ba22 100644 --- a/include/umpire/op/host.hpp +++ b/include/umpire/op/host.hpp @@ -45,7 +45,7 @@ struct copy // Async version returns a dummy event template - static camp::resources::Event exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { copy_impl(src, dst, len); return camp::resources::EventProxy{r}; } @@ -57,7 +57,7 @@ struct copy } template<> - static camp::resources::Event exec(void* src, void* dst, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(void* src, void* dst, std::size_t len, camp::resources::Resource& r) { copy_impl(src, dst, len); return camp::resources::EventProxy{r}; } @@ -74,7 +74,7 @@ struct memset // Async version returns a dummy event template - static camp::resources::Event exec(T* src, int val, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(T* src, int val, std::size_t len, camp::resources::Resource& r) { memset_impl(src, val, len); return camp::resources::EventProxy{r}; } @@ -86,7 +86,7 @@ struct memset } template<> - static camp::resources::Event exec(void* src, int val, std::size_t len, camp::resources::Resource& r) { + static camp::resources::EventProxy exec(void* src, int val, std::size_t len, camp::resources::Resource& r) { memset_impl(src, val, len); return camp::resources::EventProxy{r}; } diff --git a/src/umpire/ResourceManager.cpp b/src/umpire/ResourceManager.cpp index 3f0c5d551..670bc851a 100644 --- a/src/umpire/ResourceManager.cpp +++ b/src/umpire/ResourceManager.cpp @@ -392,8 +392,6 @@ void ResourceManager::copy(void* dst_ptr, void* src_ptr, std::size_t size) { UMPIRE_LOG(Debug, "(src_ptr=" << src_ptr << ", dst_ptr=" << dst_ptr << ", size=" << size << ")"); - //auto& op_registry = op::MemoryOperationRegistry::getInstance(); - auto src_alloc_record = m_allocations.find(src_ptr); std::ptrdiff_t src_offset = static_cast(src_ptr) - static_cast(src_alloc_record->ptr); std::size_t src_size = src_alloc_record->size - src_offset; @@ -406,27 +404,28 @@ void ResourceManager::copy(void* dst_ptr, void* src_ptr, std::size_t size) size = src_size; } - //umpire::event::record([&](auto& event) { - // event.name("copy") - // .category(event::category::operation) - // .arg("src", src_ptr) - // .arg("dst", dst_ptr) - // .arg("src_offset", src_offset) - // .arg("dst_offset", dst_offset) - // .arg("size", size) - // .arg("src_allocator_ref", (void*)src_alloc_record->strategy) - // .arg("dst_allocator_ref", (void*)dst_alloc_record->strategy) - // .tag("src_allocator_name", src_alloc_record->strategy->getName()) - // .tag("dst_allocator_name", dst_alloc_record->strategy->getName()) - // .tag("replay", "true"); - //}); + umpire::event::record([&](auto& event) { + event.name("copy") + .category(event::category::operation) + .arg("src", src_ptr) + .arg("dst", dst_ptr) + .arg("src_offset", src_offset) + .arg("dst_offset", dst_offset) + .arg("size", size) + .arg("src_allocator_ref", (void*)src_alloc_record->strategy) + .arg("dst_allocator_ref", (void*)dst_alloc_record->strategy) + .tag("src_allocator_name", src_alloc_record->strategy->getName()) + .tag("dst_allocator_name", dst_alloc_record->strategy->getName()) + .tag("replay", "true"); + }); if (size > dst_size) { UMPIRE_ERROR(runtime_error, fmt::format("Not enough space in destination to copy {} bytes into {} bytes", size, dst_size)); } - umpire::copy(src_ptr, dst_ptr, size); + // Use the template-based copy operation + umpire::copy(static_cast(src_ptr), static_cast(dst_ptr), size); } camp::resources::EventProxy ResourceManager::copy(void* dst_ptr, void* src_ptr, @@ -435,8 +434,6 @@ camp::resources::EventProxy ResourceManager::copy(voi { UMPIRE_LOG(Debug, "(src_ptr=" << src_ptr << ", dst_ptr=" << dst_ptr << ", size=" << size << ")"); - auto& op_registry = op::MemoryOperationRegistry::getInstance(); - auto src_alloc_record = m_allocations.find(src_ptr); std::ptrdiff_t src_offset = static_cast(src_ptr) - static_cast(src_alloc_record->ptr); std::size_t src_size = src_alloc_record->size - src_offset; @@ -469,17 +466,19 @@ camp::resources::EventProxy ResourceManager::copy(voi UMPIRE_ERROR(runtime_error, fmt::format("Not enough resource in destination for copy: {} -> {}", size, dst_size)); } - auto op = op_registry.find("COPY", src_alloc_record->strategy, dst_alloc_record->strategy); - - return op->transform_async(src_ptr, &dst_ptr, src_alloc_record, dst_alloc_record, size, ctx); + // Use the template-based async copy operation directly + return umpire::copy(static_cast(src_ptr), static_cast(dst_ptr), ctx, size); + + // If there are issues with the template-based implementation, fall back to the class-based one: + // auto& op_registry = op::MemoryOperationRegistry::getInstance(); + // auto op = op_registry.find("COPY", src_alloc_record->strategy, dst_alloc_record->strategy); + // return op->transform_async(src_ptr, &dst_ptr, src_alloc_record, dst_alloc_record, size, ctx); } void ResourceManager::memset(void* ptr, int value, std::size_t length) { UMPIRE_LOG(Debug, "(ptr=" << ptr << ", value=" << value << ", length=" << length << ")"); - auto& op_registry = op::MemoryOperationRegistry::getInstance(); - auto alloc_record = m_allocations.find(ptr); std::ptrdiff_t offset = static_cast(ptr) - static_cast(alloc_record->ptr); @@ -504,10 +503,8 @@ void ResourceManager::memset(void* ptr, int value, std::size_t length) UMPIRE_ERROR(runtime_error, fmt::format("Cannot memset over the end of allocation: {} -> {}", length, size)); } - //auto op = op_registry.find("MEMSET", alloc_record->strategy, alloc_record->strategy); - - //op->apply(ptr, alloc_record, value, length); - umpire::memset(ptr, value, length); + // Use the template-based memset operation + umpire::memset(static_cast(ptr), value, length); } camp::resources::EventProxy ResourceManager::memset(void* ptr, int value, @@ -516,8 +513,6 @@ camp::resources::EventProxy ResourceManager::memset(v { UMPIRE_LOG(Debug, "(ptr=" << ptr << ", value=" << value << ", length=" << length << ")"); - auto& op_registry = op::MemoryOperationRegistry::getInstance(); - auto alloc_record = m_allocations.find(ptr); std::ptrdiff_t offset = static_cast(ptr) - static_cast(alloc_record->ptr); @@ -543,9 +538,13 @@ camp::resources::EventProxy ResourceManager::memset(v UMPIRE_ERROR(runtime_error, fmt::format("Cannot memset over the end of allocation: {} -> {}", length, size)); } - auto op = op_registry.find("MEMSET", alloc_record->strategy, alloc_record->strategy); - - return op->apply_async(ptr, alloc_record, value, length, ctx); + // Use the template-based async memset operation directly + return umpire::memset(static_cast(ptr), value, ctx, length); + + // If there are issues with the template-based implementation, fall back to the class-based one: + // auto& op_registry = op::MemoryOperationRegistry::getInstance(); + // auto op = op_registry.find("MEMSET", alloc_record->strategy, alloc_record->strategy); + // return op->apply_async(ptr, alloc_record, value, length, ctx); } void* ResourceManager::reallocate(void* current_ptr, std::size_t new_size) @@ -697,14 +696,17 @@ void* ResourceManager::reallocate_impl(void* current_ptr, std::size_t new_size, alloc.deallocate(current_ptr); new_ptr = alloc.allocate(new_size); } else { - auto& op_registry = op::MemoryOperationRegistry::getInstance(); - + // Check for offset pointer if (current_ptr != alloc_record->ptr) { UMPIRE_ERROR(runtime_error, fmt::format("Cannot reallocate an offset ptr (ptr={}, base={})", current_ptr, alloc_record->ptr)); } - + + // During transition we need to use the MemoryOperationRegistry for the actual reallocation + // since our template-based implementation doesn't have access to the allocation records + auto& op_registry = op::MemoryOperationRegistry::getInstance(); std::shared_ptr op; + if (alloc_record->strategy->getPlatform() == Platform::host && getAllocator("HOST").getId() != alloc_record->strategy->getId()) { op = op_registry.find("REALLOCATE", std::make_pair(Platform::undefined, Platform::undefined)); @@ -713,6 +715,16 @@ void* ResourceManager::reallocate_impl(void* current_ptr, std::size_t new_size, } op->transform(current_ptr, &new_ptr, alloc_record, alloc_record, new_size); + + // In the future, when the template implementation is ready: + // auto platform = alloc_record->strategy->getPlatform(); + // if (platform == Platform::host) { + // new_ptr = umpire::reallocate(current_ptr, new_size); + // } else if (platform == Platform::cuda) { + // new_ptr = umpire::reallocate(current_ptr, new_size); + // } else if (platform == Platform::hip) { + // new_ptr = umpire::reallocate(current_ptr, new_size); + // } } } @@ -748,22 +760,50 @@ void* ResourceManager::reallocate_impl(void* current_ptr, std::size_t new_size, alloc.deallocate(current_ptr); new_ptr = alloc.allocate(new_size); } else { - auto& op_registry = op::MemoryOperationRegistry::getInstance(); - + // Check for offset pointer if (current_ptr != alloc_record->ptr) { UMPIRE_ERROR(runtime_error, fmt::format("Cannot reallocate an offset ptr (ptr={}, base={})", current_ptr, alloc_record->ptr)); } - + + // During transition we need to use the MemoryOperationRegistry for the actual reallocation + // since our template-based implementation doesn't have access to the allocation records + auto& op_registry = op::MemoryOperationRegistry::getInstance(); std::shared_ptr op; + if (alloc_record->strategy->getPlatform() == Platform::host && getAllocator("HOST").getId() != alloc_record->strategy->getId()) { op = op_registry.find("REALLOCATE", std::make_pair(Platform::undefined, Platform::undefined)); op->transform(current_ptr, &new_ptr, alloc_record, alloc_record, new_size); } else { op = op_registry.find("REALLOCATE", alloc_record->strategy, alloc_record->strategy); - op->transform_async(current_ptr, &new_ptr, alloc_record, alloc_record, new_size, ctx); + // Use async transform for async reallocate + auto event = op->transform_async(current_ptr, &new_ptr, alloc_record, alloc_record, new_size, ctx); + // We might need to wait for the event to complete here since reallocate + // needs to return a pointer that is immediately usable + event.wait(); } + + // When the template implementation is ready, uncomment this code: + // auto platform = alloc_record->strategy->getPlatform(); + // if (platform == Platform::host) { + // // For async host reallocate we still need to handle allocation records properly + // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); + // // Wait for the operation to complete since we need the pointer right away + // proxy.wait(); + // // Get the result pointer from somewhere... + // // new_ptr = ... + // } else if (platform == Platform::cuda) { + // // Similar pattern for CUDA + // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); + // proxy.wait(); + // // new_ptr = ... + // } else if (platform == Platform::hip) { + // // Similar pattern for HIP + // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); + // proxy.wait(); + // // new_ptr = ... + // } } } @@ -851,7 +891,6 @@ camp::resources::EventProxy ResourceManager::prefetch { UMPIRE_LOG(Debug, "(ptr=" << ptr << ", device=" << device << ")"); - auto& op_registry = op::MemoryOperationRegistry::getInstance(); auto alloc_record = m_allocations.find(ptr); if (alloc_record->strategy->getTraits().resource != umpire::MemoryResourceTraits::resource_type::um) { @@ -861,8 +900,28 @@ camp::resources::EventProxy ResourceManager::prefetch std::ptrdiff_t offset = static_cast(ptr) - static_cast(alloc_record->ptr); std::size_t size = alloc_record->size - offset; + auto platform = alloc_record->strategy->getPlatform(); + + // We need to add a template-based prefetch operation in include/umpire/op/dispatch.hpp + // For now, we'll continue to use the class-based implementation + auto& op_registry = op::MemoryOperationRegistry::getInstance(); auto op = op_registry.find("PREFETCH", alloc_record->strategy, alloc_record->strategy); return op->apply_async(ptr, alloc_record, device, size, ctx); + + // In the future, we would have something like: + /* + if (platform == Platform::cuda) { + return umpire::prefetch(static_cast(ptr), device, ctx, size); + } else if (platform == Platform::hip) { + return umpire::prefetch(static_cast(ptr), device, ctx, size); + } else { + UMPIRE_ERROR(runtime_error, + fmt::format("Prefetch not supported for platform: {}", + static_cast(platform))); + // Unreachable, but needed to satisfy compiler + return camp::resources::EventProxy{ctx}; + } + */ } void ResourceManager::deallocate(void* ptr) From 2d15ce3308d3d139827d7e80e6984a14d55806d9 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Mon, 31 Mar 2025 14:48:33 -0700 Subject: [PATCH 03/26] Add SYCL and OpenMP target --- include/umpire/op.hpp | 5 +- include/umpire/op/dispatch.hpp | 56 ++++++ include/umpire/op/openmp_target.hpp | 241 +++++++++++++++++++++++++ include/umpire/op/sycl.hpp | 261 ++++++++++++++++++++++++++++ src/umpire/ResourceManager.cpp | 16 ++ 5 files changed, 578 insertions(+), 1 deletion(-) create mode 100644 include/umpire/op/openmp_target.hpp create mode 100644 include/umpire/op/sycl.hpp diff --git a/include/umpire/op.hpp b/include/umpire/op.hpp index 4130009aa..bb76f69b8 100644 --- a/include/umpire/op.hpp +++ b/include/umpire/op.hpp @@ -10,8 +10,11 @@ #if defined(UMPIRE_ENABLE_HIP) #include "umpire/op/hip.hpp" #endif +#if defined(UMPIRE_ENABLE_SYCL) +#include "umpire/op/sycl.hpp" +#endif #if defined(UMPIRE_ENABLE_OPENMP_TARGET) -#include "umpire/op/omp_target.hpp" +#include "umpire/op/openmp_target.hpp" #endif #include "umpire/op/dispatch.hpp" \ No newline at end of file diff --git a/include/umpire/op/dispatch.hpp b/include/umpire/op/dispatch.hpp index 082ec75b2..715a1c19b 100644 --- a/include/umpire/op/dispatch.hpp +++ b/include/umpire/op/dispatch.hpp @@ -33,6 +33,16 @@ struct op_caller<1, Op > { else if (p == camp::resources::Platform::hip) { Op::exec(src, args...); } +#endif +#if defined(UMPIRE_ENABLE_SYCL) + else if (p == camp::resources::Platform::sycl) { + Op::exec(src, args...); + } +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) + else if (p == camp::resources::Platform::omp_target) { + Op::exec(src, args...); + } #endif } }; @@ -76,6 +86,24 @@ struct op_caller<2, Op> { } else if (p1 == camp::resources::Platform::hip && p2 == camp::resources::Platform::host) { Op::exec(src, dst, args...); } +#endif +#if defined(UMPIRE_ENABLE_SYCL) + if (p1 == p2 && (p1 == camp::resources::Platform::sycl)) { + Op::exec(src, dst, args...); + } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::sycl) { + Op::exec(src, dst, args...); + } else if (p1 == camp::resources::Platform::sycl && p2 == camp::resources::Platform::host) { + Op::exec(src, dst, args...); + } +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) + if (p1 == p2 && (p1 == camp::resources::Platform::omp_target)) { + Op::exec(src, dst, args...); + } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::omp_target) { + Op::exec(src, dst, args...); + } else if (p1 == camp::resources::Platform::omp_target && p2 == camp::resources::Platform::host) { + Op::exec(src, dst, args...); + } #endif } }; @@ -123,6 +151,24 @@ camp::resources::EventProxy copy(T* src, T* dst, camp return op::copy::exec(src, dst, len, ctx); } #endif +#if defined(UMPIRE_ENABLE_SYCL) + if (p1 == p2 && (p1 == camp::resources::Platform::sycl)) { + return op::copy::exec(src, dst, len, ctx); + } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::sycl) { + return op::copy::exec(src, dst, len, ctx); + } else if (p1 == camp::resources::Platform::sycl && p2 == camp::resources::Platform::host) { + return op::copy::exec(src, dst, len, ctx); + } +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) + if (p1 == p2 && (p1 == camp::resources::Platform::omp_target)) { + return op::copy::exec(src, dst, len, ctx); + } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::omp_target) { + return op::copy::exec(src, dst, len, ctx); + } else if (p1 == camp::resources::Platform::omp_target && p2 == camp::resources::Platform::host) { + return op::copy::exec(src, dst, len, ctx); + } +#endif UMPIRE_ERROR(runtime_error, fmt::format("Unknown platforms for copy: src={}, dst={}", @@ -156,6 +202,16 @@ camp::resources::EventProxy memset(T* src, int v, cam return op::memset::exec(src, v, len, ctx); } #endif +#if defined(UMPIRE_ENABLE_SYCL) + else if (p == camp::resources::Platform::sycl) { + return op::memset::exec(src, v, len, ctx); + } +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) + else if (p == camp::resources::Platform::omp_target) { + return op::memset::exec(src, v, len, ctx); + } +#endif UMPIRE_ERROR(runtime_error, fmt::format("Unknown platform for memset: platform={}", diff --git a/include/umpire/op/openmp_target.hpp b/include/umpire/op/openmp_target.hpp new file mode 100644 index 000000000..50e2f48d4 --- /dev/null +++ b/include/umpire/op/openmp_target.hpp @@ -0,0 +1,241 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC and Umpire +// project contributors. See the COPYRIGHT file for details. +// +// SPDX-License-Identifier: (MIT) +////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include "umpire/config.hpp" + +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) + +#include +#include +#include +#include +#include + +#include "umpire/util/Platform.hpp" +#include "umpire/util/error.hpp" + +#include "umpire/resource/platform.hpp" + +#include "camp/resource.hpp" +#include "camp/resource/event.hpp" + +namespace umpire { +namespace op { + +struct openmp_target_platform {}; + +// Helper function for copy operations +template +inline void copy_impl(T* src_ptr, T* dst_ptr, std::size_t len) { + #pragma omp target data use_device_ptr(src_ptr, dst_ptr) + { + std::memcpy(dst_ptr, src_ptr, len * sizeof(T)); + } +} + +// Helper function for copy operations that returns an Event +template +inline camp::resources::EventProxy copy_async_impl( + T* src_ptr, T* dst_ptr, std::size_t len, camp::resources::Resource& res) { + + #pragma omp target data use_device_ptr(src_ptr, dst_ptr) + { + std::memcpy(dst_ptr, src_ptr, len * sizeof(T)); + } + + // OpenMP Target doesn't have async operations, so we just return a completed event + return camp::resources::EventProxy{res}; +} + +// Helper function for memset operations +template +inline void memset_impl(T* ptr, int val, std::size_t len) { + #pragma omp target data use_device_ptr(ptr) + { + std::memset(ptr, val, len * sizeof(T)); + } +} + +// Helper function for memset operations that returns an Event +template +inline camp::resources::EventProxy memset_async_impl( + T* ptr, int val, std::size_t len, camp::resources::Resource& res) { + + #pragma omp target data use_device_ptr(ptr) + { + std::memset(ptr, val, len * sizeof(T)); + } + + // OpenMP Target doesn't have async operations, so we just return a completed event + return camp::resources::EventProxy{res}; +} + +// Device-to-device copy specialization +template<> +struct copy { + template + static void exec(T* src_ptr, T* dst_ptr, std::size_t len) { + copy_impl(src_ptr, dst_ptr, len); + } + + // void pointer specialization + static void exec(void* src_ptr, void* dst_ptr, std::size_t len) { + copy_impl(static_cast(src_ptr), static_cast(dst_ptr), len); + } + + template + static camp::resources::EventProxy exec( + T* src_ptr, T* dst_ptr, std::size_t len, camp::resources::Resource& res) { + return copy_async_impl(src_ptr, dst_ptr, len, res); + } + + // void pointer specialization + static camp::resources::EventProxy exec( + void* src_ptr, void* dst_ptr, std::size_t len, camp::resources::Resource& res) { + return copy_async_impl(static_cast(src_ptr), static_cast(dst_ptr), len, res); + } +}; + +// Host-to-device copy specialization +template<> +struct copy { + template + static void exec(T* src_ptr, T* dst_ptr, std::size_t len) { + #pragma omp target data use_device_ptr(dst_ptr) + { + std::memcpy(dst_ptr, src_ptr, len * sizeof(T)); + } + } + + // void pointer specialization + static void exec(void* src_ptr, void* dst_ptr, std::size_t len) { + #pragma omp target data use_device_ptr(dst_ptr) + { + std::memcpy(dst_ptr, src_ptr, len * sizeof(T)); + } + } + + template + static camp::resources::EventProxy exec( + T* src_ptr, T* dst_ptr, std::size_t len, camp::resources::Resource& res) { + #pragma omp target data use_device_ptr(dst_ptr) + { + std::memcpy(dst_ptr, src_ptr, len * sizeof(T)); + } + + return camp::resources::EventProxy{res}; + } + + // void pointer specialization + static camp::resources::EventProxy exec( + void* src_ptr, void* dst_ptr, std::size_t len, camp::resources::Resource& res) { + #pragma omp target data use_device_ptr(dst_ptr) + { + std::memcpy(dst_ptr, src_ptr, len * sizeof(char) * len); + } + + return camp::resources::EventProxy{res}; + } +}; + +// Device-to-host copy specialization +template<> +struct copy { + template + static void exec(T* src_ptr, T* dst_ptr, std::size_t len) { + #pragma omp target data use_device_ptr(src_ptr) + { + std::memcpy(dst_ptr, src_ptr, len * sizeof(T)); + } + } + + // void pointer specialization + static void exec(void* src_ptr, void* dst_ptr, std::size_t len) { + #pragma omp target data use_device_ptr(src_ptr) + { + std::memcpy(dst_ptr, src_ptr, len * sizeof(char)); + } + } + + template + static camp::resources::EventProxy exec( + T* src_ptr, T* dst_ptr, std::size_t len, camp::resources::Resource& res) { + #pragma omp target data use_device_ptr(src_ptr) + { + std::memcpy(dst_ptr, src_ptr, len * sizeof(T)); + } + + return camp::resources::EventProxy{res}; + } + + // void pointer specialization + static camp::resources::EventProxy exec( + void* src_ptr, void* dst_ptr, std::size_t len, camp::resources::Resource& res) { + #pragma omp target data use_device_ptr(src_ptr) + { + std::memcpy(dst_ptr, src_ptr, len * sizeof(char)); + } + + return camp::resources::EventProxy{res}; + } +}; + +// Memset specialization +template<> +struct memset { + template + static void exec(T* ptr, int val, std::size_t len) { + memset_impl(ptr, val, len); + } + + // void pointer specialization + static void exec(void* ptr, int val, std::size_t len) { + memset_impl(static_cast(ptr), val, len); + } + + template + static camp::resources::EventProxy exec( + T* ptr, int val, std::size_t len, camp::resources::Resource& res) { + return memset_async_impl(ptr, val, len, res); + } + + // void pointer specialization + static camp::resources::EventProxy exec( + void* ptr, int val, std::size_t len, camp::resources::Resource& res) { + return memset_async_impl(static_cast(ptr), val, len, res); + } +}; + +// Reallocate operations - basic implementation +template<> +struct reallocate { + template + static T* exec(T* src_ptr, std::size_t size) { + // For OpenMP Target, we need a strategy that involves: + // 1. Allocate new memory + // 2. Copy data if src_ptr is not null + // 3. Free old memory if src_ptr is not null + // + // This requires allocation information which is not available + // in this layer, so it's implemented in ResourceManager + UMPIRE_ERROR(runtime_error, "Direct OpenMP Target reallocate not implemented"); + return nullptr; + } + + template + static camp::resources::EventProxy exec( + T* src_ptr, std::size_t size, camp::resources::Resource& res) { + UMPIRE_ERROR(runtime_error, "Direct OpenMP Target async reallocate not implemented"); + return camp::resources::EventProxy{res}; + } +}; + +} // end of namespace op +} // end of namespace umpire + +#endif // UMPIRE_ENABLE_OPENMP_TARGET \ No newline at end of file diff --git a/include/umpire/op/sycl.hpp b/include/umpire/op/sycl.hpp new file mode 100644 index 000000000..980182541 --- /dev/null +++ b/include/umpire/op/sycl.hpp @@ -0,0 +1,261 @@ +////////////////////////////////////////////////////////////////////////////// +// Copyright (c) 2016-25, Lawrence Livermore National Security, LLC and Umpire +// project contributors. See the COPYRIGHT file for details. +// +// SPDX-License-Identifier: (MIT) +////////////////////////////////////////////////////////////////////////////// +#pragma once + +#include "umpire/config.hpp" + +#if defined(UMPIRE_ENABLE_SYCL) + +#include "umpire/util/Platform.hpp" +#include "umpire/util/error.hpp" +#include "umpire/util/sycl_compat.hpp" + +#include "umpire/resource/platform.hpp" + +#include "camp/resource.hpp" +#include "camp/resource/event.hpp" + +#include +#include +#include + +namespace umpire { +namespace op { + +struct sycl_platform {}; + +// Error handling for SYCL operations +inline void sycl_error_check(sycl::event event, const char* message) { + try { + event.wait_and_throw(); + } catch (const sycl::exception& e) { + UMPIRE_ERROR(runtime_error, message + std::string(": ") + std::string(e.what())); + } +} + +// Helper function for copy operations that returns an Event +template +inline camp::resources::EventProxy copy_async_impl( + T* src_ptr, T* dst_ptr, std::size_t len, camp::resources::Resource& res, + sycl::usm::alloc alloc_type) { + + auto& sycl_res = dynamic_cast(res); + sycl::queue& queue = sycl_res.get_queue(); + + auto event = queue.memcpy(dst_ptr, src_ptr, len * sizeof(T)); + + return camp::resources::EventProxy{res, event}; +} + +// Helper function for copy operations +template +inline void copy_impl(T* src_ptr, T* dst_ptr, std::size_t len, sycl::usm::alloc alloc_type) { + sycl::queue queue; + auto event = queue.memcpy(dst_ptr, src_ptr, len * sizeof(T)); + sycl_error_check(event, "SYCL memcpy failed"); +} + +// Helper function for memset operations that returns an Event +template +inline camp::resources::EventProxy memset_async_impl( + T* ptr, int val, std::size_t len, camp::resources::Resource& res) { + + auto& sycl_res = dynamic_cast(res); + sycl::queue& queue = sycl_res.get_queue(); + + auto event = queue.memset(ptr, val, len * sizeof(T)); + + return camp::resources::EventProxy{res, event}; +} + +// Helper function for memset operations +template +inline void memset_impl(T* ptr, int val, std::size_t len) { + sycl::queue queue; + auto event = queue.memset(ptr, val, len * sizeof(T)); + sycl_error_check(event, "SYCL memset failed"); +} + +// Helper function for prefetch operations that returns an Event +template +inline camp::resources::EventProxy prefetch_async_impl( + T* ptr, int device, std::size_t len, camp::resources::Resource& res) { + + auto& sycl_res = dynamic_cast(res); + sycl::queue& queue = sycl_res.get_queue(); + + auto event = queue.prefetch(ptr, len * sizeof(T)); + + return camp::resources::EventProxy{res, event}; +} + +// Helper function for prefetch operations +template +inline void prefetch_impl(T* ptr, int device, std::size_t len) { + sycl::queue queue; + auto event = queue.prefetch(ptr, len * sizeof(T)); + sycl_error_check(event, "SYCL prefetch failed"); +} + +// Device-to-device copy specialization +template<> +struct copy { + template + static void exec(T* src_ptr, T* dst_ptr, std::size_t len) { + copy_impl(src_ptr, dst_ptr, len, sycl::usm::alloc::device); + } + + // void pointer specialization + static void exec(void* src_ptr, void* dst_ptr, std::size_t len) { + copy_impl(static_cast(src_ptr), static_cast(dst_ptr), len, sycl::usm::alloc::device); + } + + template + static camp::resources::EventProxy exec( + T* src_ptr, T* dst_ptr, std::size_t len, camp::resources::Resource& res) { + return copy_async_impl(src_ptr, dst_ptr, len, res, sycl::usm::alloc::device); + } + + // void pointer specialization + static camp::resources::EventProxy exec( + void* src_ptr, void* dst_ptr, std::size_t len, camp::resources::Resource& res) { + return copy_async_impl(static_cast(src_ptr), static_cast(dst_ptr), len, res, sycl::usm::alloc::device); + } +}; + +// Host-to-device copy specialization +template<> +struct copy { + template + static void exec(T* src_ptr, T* dst_ptr, std::size_t len) { + copy_impl(src_ptr, dst_ptr, len, sycl::usm::alloc::host); + } + + // void pointer specialization + static void exec(void* src_ptr, void* dst_ptr, std::size_t len) { + copy_impl(static_cast(src_ptr), static_cast(dst_ptr), len, sycl::usm::alloc::host); + } + + template + static camp::resources::EventProxy exec( + T* src_ptr, T* dst_ptr, std::size_t len, camp::resources::Resource& res) { + return copy_async_impl(src_ptr, dst_ptr, len, res, sycl::usm::alloc::host); + } + + // void pointer specialization + static camp::resources::EventProxy exec( + void* src_ptr, void* dst_ptr, std::size_t len, camp::resources::Resource& res) { + return copy_async_impl(static_cast(src_ptr), static_cast(dst_ptr), len, res, sycl::usm::alloc::host); + } +}; + +// Device-to-host copy specialization +template<> +struct copy { + template + static void exec(T* src_ptr, T* dst_ptr, std::size_t len) { + copy_impl(src_ptr, dst_ptr, len, sycl::usm::alloc::host); + } + + // void pointer specialization + static void exec(void* src_ptr, void* dst_ptr, std::size_t len) { + copy_impl(static_cast(src_ptr), static_cast(dst_ptr), len, sycl::usm::alloc::host); + } + + template + static camp::resources::EventProxy exec( + T* src_ptr, T* dst_ptr, std::size_t len, camp::resources::Resource& res) { + return copy_async_impl(src_ptr, dst_ptr, len, res, sycl::usm::alloc::host); + } + + // void pointer specialization + static camp::resources::EventProxy exec( + void* src_ptr, void* dst_ptr, std::size_t len, camp::resources::Resource& res) { + return copy_async_impl(static_cast(src_ptr), static_cast(dst_ptr), len, res, sycl::usm::alloc::host); + } +}; + +// Memset specialization +template<> +struct memset { + template + static void exec(T* ptr, int val, std::size_t len) { + memset_impl(ptr, val, len); + } + + // void pointer specialization + static void exec(void* ptr, int val, std::size_t len) { + memset_impl(static_cast(ptr), val, len); + } + + template + static camp::resources::EventProxy exec( + T* ptr, int val, std::size_t len, camp::resources::Resource& res) { + return memset_async_impl(ptr, val, len, res); + } + + // void pointer specialization + static camp::resources::EventProxy exec( + void* ptr, int val, std::size_t len, camp::resources::Resource& res) { + return memset_async_impl(static_cast(ptr), val, len, res); + } +}; + +// Prefetch specialization +template<> +struct prefetch { + template + static void exec(T* ptr, int device, std::size_t len) { + prefetch_impl(ptr, device, len); + } + + // void pointer specialization + static void exec(void* ptr, int device, std::size_t len) { + prefetch_impl(static_cast(ptr), device, len); + } + + template + static camp::resources::EventProxy exec( + T* ptr, int device, std::size_t len, camp::resources::Resource& res) { + return prefetch_async_impl(ptr, device, len, res); + } + + // void pointer specialization + static camp::resources::EventProxy exec( + void* ptr, int device, std::size_t len, camp::resources::Resource& res) { + return prefetch_async_impl(static_cast(ptr), device, len, res); + } +}; + +// Reallocate operations - basic implementation +template<> +struct reallocate { + template + static T* exec(T* src_ptr, std::size_t size) { + // For SYCL, we need a strategy that involves: + // 1. Allocate new memory + // 2. Copy data if src_ptr is not null + // 3. Free old memory if src_ptr is not null + // + // This requires allocation information which is not available + // in this layer, so it's implemented in ResourceManager + UMPIRE_ERROR(runtime_error, "Direct SYCL reallocate not implemented"); + return nullptr; + } + + template + static camp::resources::EventProxy exec( + T* src_ptr, std::size_t size, camp::resources::Resource& res) { + UMPIRE_ERROR(runtime_error, "Direct SYCL async reallocate not implemented"); + return camp::resources::EventProxy{res}; + } +}; + +} // end of namespace op +} // end of namespace umpire + +#endif // UMPIRE_ENABLE_SYCL \ No newline at end of file diff --git a/src/umpire/ResourceManager.cpp b/src/umpire/ResourceManager.cpp index 670bc851a..b36aee8d2 100644 --- a/src/umpire/ResourceManager.cpp +++ b/src/umpire/ResourceManager.cpp @@ -724,6 +724,10 @@ void* ResourceManager::reallocate_impl(void* current_ptr, std::size_t new_size, // new_ptr = umpire::reallocate(current_ptr, new_size); // } else if (platform == Platform::hip) { // new_ptr = umpire::reallocate(current_ptr, new_size); + // } else if (platform == Platform::sycl) { + // new_ptr = umpire::reallocate(current_ptr, new_size); + // } else if (platform == Platform::omp_target) { + // new_ptr = umpire::reallocate(current_ptr, new_size); // } } } @@ -803,6 +807,16 @@ void* ResourceManager::reallocate_impl(void* current_ptr, std::size_t new_size, // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); // proxy.wait(); // // new_ptr = ... + // } else if (platform == Platform::sycl) { + // // Similar pattern for SYCL + // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); + // proxy.wait(); + // // new_ptr = ... + // } else if (platform == Platform::omp_target) { + // // Similar pattern for OpenMP Target + // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); + // proxy.wait(); + // // new_ptr = ... // } } } @@ -914,6 +928,8 @@ camp::resources::EventProxy ResourceManager::prefetch return umpire::prefetch(static_cast(ptr), device, ctx, size); } else if (platform == Platform::hip) { return umpire::prefetch(static_cast(ptr), device, ctx, size); + } else if (platform == Platform::sycl) { + return umpire::prefetch(static_cast(ptr), device, ctx, size); } else { UMPIRE_ERROR(runtime_error, fmt::format("Prefetch not supported for platform: {}", From 3d96145a223dcbb5a842e338fbc645f7d5f272e4 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Mon, 31 Mar 2025 14:59:06 -0700 Subject: [PATCH 04/26] Add generic_reallocate --- include/umpire/op/dispatch.hpp | 87 +++++++++++++++++++++++--- include/umpire/op/operations.hpp | 103 +++++++++++++++++++++++++++++++ src/umpire/ResourceManager.cpp | 57 +++++------------ 3 files changed, 195 insertions(+), 52 deletions(-) diff --git a/include/umpire/op/dispatch.hpp b/include/umpire/op/dispatch.hpp index 715a1c19b..d438f5964 100644 --- a/include/umpire/op/dispatch.hpp +++ b/include/umpire/op/dispatch.hpp @@ -223,20 +223,89 @@ camp::resources::EventProxy memset(T* src, int v, cam template T* reallocate(T* src, std::size_t size) { - // Template-based reallocate is a placeholder for now - // Need the ResourceManager to handle allocations/deallocations - // and allocation record tracking - return nullptr; + if (src == nullptr) { + // If src is nullptr, just allocate memory from the default allocator + auto& rm = ResourceManager::getInstance(); + Allocator allocator = rm.getDefaultAllocator(); + return static_cast(allocator.allocate(size * sizeof(T))); + } + + // Otherwise, use the platform-specific implementation if available, + // falling back to the generic implementation + auto& allocation_map = ResourceManager::getInstance().m_allocations; + auto src_record = allocation_map.find(src); + auto p = src_record->strategy->getPlatform(); + + if (p == camp::resources::Platform::host) { + return op::generic_reallocate::exec(src, size); + } +#if defined(UMPIRE_ENABLE_CUDA) + else if (p == camp::resources::Platform::cuda) { + return op::generic_reallocate::exec(src, size); + } +#endif +#if defined(UMPIRE_ENABLE_HIP) + else if (p == camp::resources::Platform::hip) { + return op::generic_reallocate::exec(src, size); + } +#endif +#if defined(UMPIRE_ENABLE_SYCL) + else if (p == camp::resources::Platform::sycl) { + return op::generic_reallocate::exec(src, size); + } +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) + else if (p == camp::resources::Platform::omp_target) { + return op::generic_reallocate::exec(src, size); + } +#endif + + // Fallback to generic implementation + return op::generic_reallocate::exec(src, size); } template camp::resources::EventProxy reallocate(T* src, std::size_t size, camp::resources::Resource& ctx) { - // Template-based reallocate is a placeholder for now - // Need the ResourceManager to handle allocations/deallocations - // and allocation record tracking with async support + if (src == nullptr) { + // If src is nullptr, just allocate memory from the default allocator + auto& rm = ResourceManager::getInstance(); + Allocator allocator = rm.getDefaultAllocator(); + allocator.allocate(size * sizeof(T)); + return camp::resources::EventProxy{ctx}; + } - // Placeholder return to satisfy compiler - return camp::resources::EventProxy{ctx}; + // Otherwise, use the platform-specific implementation if available, + // falling back to the generic implementation + auto& allocation_map = ResourceManager::getInstance().m_allocations; + auto src_record = allocation_map.find(src); + auto p = src_record->strategy->getPlatform(); + + if (p == camp::resources::Platform::host) { + return op::generic_reallocate::exec(src, size, ctx); + } +#if defined(UMPIRE_ENABLE_CUDA) + else if (p == camp::resources::Platform::cuda) { + return op::generic_reallocate::exec(src, size, ctx); + } +#endif +#if defined(UMPIRE_ENABLE_HIP) + else if (p == camp::resources::Platform::hip) { + return op::generic_reallocate::exec(src, size, ctx); + } +#endif +#if defined(UMPIRE_ENABLE_SYCL) + else if (p == camp::resources::Platform::sycl) { + return op::generic_reallocate::exec(src, size, ctx); + } +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) + else if (p == camp::resources::Platform::omp_target) { + return op::generic_reallocate::exec(src, size, ctx); + } +#endif + + // Fallback to generic implementation + return op::generic_reallocate::exec(src, size, ctx); } template diff --git a/include/umpire/op/operations.hpp b/include/umpire/op/operations.hpp index e10b16b8f..ea52ccbba 100644 --- a/include/umpire/op/operations.hpp +++ b/include/umpire/op/operations.hpp @@ -1,5 +1,12 @@ #pragma once +#include "umpire/config.hpp" +#include "umpire/Allocator.hpp" +#include "umpire/ResourceManager.hpp" +#include "camp/resource.hpp" + +#include + namespace umpire { namespace op { @@ -22,6 +29,102 @@ struct reallocate : public operation { static constexpr int arity = 1; }; +// Generic reallocate implementation that works for any platform +// This is the template-based version of GenericReallocateOperation +template +struct generic_reallocate : public operation { + static constexpr int arity = 1; + + template + static T* exec(T* current_ptr, std::size_t new_size) { + if (!current_ptr) { + // If current pointer is null, just allocate + auto& rm = ResourceManager::getInstance(); + Allocator allocator = rm.getDefaultAllocator(); + return static_cast(allocator.allocate(new_size * sizeof(T))); + } + + auto& rm = ResourceManager::getInstance(); + + // Find the allocator that owns current_ptr + Allocator allocator = rm.getAllocator(current_ptr); + + // Get the current allocation size + std::size_t old_size = rm.getSize(current_ptr); + + // Convert sizes from elements to bytes + std::size_t old_bytes = old_size; + std::size_t new_bytes = new_size * sizeof(T); + + // Allocate new memory + T* new_ptr = static_cast(allocator.allocate(new_bytes)); + + // Calculate copy size (minimum of old and new size) + std::size_t copy_size = (old_bytes > new_bytes) ? new_bytes : old_bytes; + + // Copy data from old to new location + rm.copy(new_ptr, current_ptr, copy_size); + + // Deallocate old memory + allocator.deallocate(current_ptr); + + return new_ptr; + } + + // Async version + template + static camp::resources::EventProxy exec( + T* current_ptr, std::size_t new_size, camp::resources::Resource& ctx) { + if (!current_ptr) { + // If current pointer is null, just allocate + auto& rm = ResourceManager::getInstance(); + Allocator allocator = rm.getDefaultAllocator(); + // Since there's no data to copy, we can just return a completed event + allocator.allocate(new_size * sizeof(T)); + return camp::resources::EventProxy{ctx}; + } + + auto& rm = ResourceManager::getInstance(); + + // Find the allocator that owns current_ptr + Allocator allocator = rm.getAllocator(current_ptr); + + // Get the current allocation size + std::size_t old_size = rm.getSize(current_ptr); + + // Convert sizes from elements to bytes + std::size_t old_bytes = old_size; + std::size_t new_bytes = new_size * sizeof(T); + + // Allocate new memory + T* new_ptr = static_cast(allocator.allocate(new_bytes)); + + // Calculate copy size (minimum of old and new size) + std::size_t copy_size = (old_bytes > new_bytes) ? new_bytes : old_bytes; + + // Copy data from old to new location asynchronously + auto event = rm.copy(new_ptr, current_ptr, ctx, copy_size); + + // Deallocate old memory + // Note: This is problematic as we're deallocating before the copy completes + // In practice, we would need to chain operations or use a callback + allocator.deallocate(current_ptr); + + return event; + } + + // void* specialization for sync version + static void* exec(void* current_ptr, std::size_t new_size) { + return exec(static_cast(current_ptr), new_size); + } + + // void* specialization for async version + static camp::resources::EventProxy exec( + void* current_ptr, std::size_t new_size, camp::resources::Resource& ctx) { + return exec(static_cast(current_ptr), new_size, ctx); + } +}; + template struct advise : public operation { static constexpr int arity = 1; diff --git a/src/umpire/ResourceManager.cpp b/src/umpire/ResourceManager.cpp index b36aee8d2..e66be7543 100644 --- a/src/umpire/ResourceManager.cpp +++ b/src/umpire/ResourceManager.cpp @@ -716,19 +716,8 @@ void* ResourceManager::reallocate_impl(void* current_ptr, std::size_t new_size, op->transform(current_ptr, &new_ptr, alloc_record, alloc_record, new_size); - // In the future, when the template implementation is ready: - // auto platform = alloc_record->strategy->getPlatform(); - // if (platform == Platform::host) { - // new_ptr = umpire::reallocate(current_ptr, new_size); - // } else if (platform == Platform::cuda) { - // new_ptr = umpire::reallocate(current_ptr, new_size); - // } else if (platform == Platform::hip) { - // new_ptr = umpire::reallocate(current_ptr, new_size); - // } else if (platform == Platform::sycl) { - // new_ptr = umpire::reallocate(current_ptr, new_size); - // } else if (platform == Platform::omp_target) { - // new_ptr = umpire::reallocate(current_ptr, new_size); - // } + // Use the template-based reallocate operation directly + new_ptr = umpire::reallocate(static_cast(current_ptr), new_size); } } @@ -788,36 +777,18 @@ void* ResourceManager::reallocate_impl(void* current_ptr, std::size_t new_size, event.wait(); } - // When the template implementation is ready, uncomment this code: - // auto platform = alloc_record->strategy->getPlatform(); - // if (platform == Platform::host) { - // // For async host reallocate we still need to handle allocation records properly - // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); - // // Wait for the operation to complete since we need the pointer right away - // proxy.wait(); - // // Get the result pointer from somewhere... - // // new_ptr = ... - // } else if (platform == Platform::cuda) { - // // Similar pattern for CUDA - // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); - // proxy.wait(); - // // new_ptr = ... - // } else if (platform == Platform::hip) { - // // Similar pattern for HIP - // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); - // proxy.wait(); - // // new_ptr = ... - // } else if (platform == Platform::sycl) { - // // Similar pattern for SYCL - // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); - // proxy.wait(); - // // new_ptr = ... - // } else if (platform == Platform::omp_target) { - // // Similar pattern for OpenMP Target - // auto proxy = umpire::reallocate(static_cast(current_ptr), new_size, ctx); - // proxy.wait(); - // // new_ptr = ... - // } + // Use the template-based reallocate operation with async support + auto event = umpire::reallocate(static_cast(current_ptr), new_size, ctx); + // Wait for the operation to complete since we need the pointer right away + event.wait(); + + // Note: The current implementation reallocates and deallocates internally, + // but doesn't return the new pointer through the event. This is a limitation + // that would need to be addressed in a future implementation. + + // For now, we'll continue using the MemoryOperationRegistry implementation + // but in the future, we would need a mechanism to retrieve the new pointer + // from the async operation. } } From e3b1914e454028efa5aac75622fec283d047a5c5 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Mon, 31 Mar 2025 16:17:30 -0700 Subject: [PATCH 05/26] Continue refining implementation --- CMakeLists.txt | 6 +- include/umpire/op/dispatch.hpp | 444 +++++++++++++++++++++++++------ include/umpire/op/host.hpp | 193 ++++++++------ include/umpire/op/operations.hpp | 27 +- src/umpire/ResourceManager.cpp | 246 +++-------------- src/umpire/ResourceManager.hpp | 10 +- 6 files changed, 545 insertions(+), 381 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 99fb9a2ce..f8a34683b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -23,14 +23,12 @@ set(UMPIRE_VERSION_RC "") include(cmake/SetupUmpireOptions.cmake) +set(BLT_CXX_STD "c++17" CACHE STRING "Version of C++ standard") +set(CMAKE_CUDA_STANDARD 17) if (UMPIRE_ENABLE_SYCL) - set(BLT_CXX_STD "c++17" CACHE STRING "Version of C++ standard") if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "IntelLLVM") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl") endif() -else() - set(BLT_CXX_STD "c++14" CACHE STRING "Version of C++ standard") - set(CMAKE_CUDA_STANDARD 14) endif() if("${BLT_CXX_STD}" STREQUAL "c++98" OR "${BLT_CXX_STD}" STREQUAL "c++11" ) diff --git a/include/umpire/op/dispatch.hpp b/include/umpire/op/dispatch.hpp index d438f5964..361fbf91e 100644 --- a/include/umpire/op/dispatch.hpp +++ b/include/umpire/op/dispatch.hpp @@ -9,18 +9,72 @@ namespace umpire { namespace op { - -template class Op> struct op_caller{}; - -template class Op> -struct op_caller<1, Op > { +// Base template for op_caller +template class Op> +struct op_caller { + // Helper to get the last argument from the parameter pack + template + static auto get_last_arg(Args... args) { + return std::get(std::forward_as_tuple(args...)); + } + + // Helper to get the Nth argument from the parameter pack + template + static auto get_arg(Args... args) { + return std::get(std::forward_as_tuple(args...)); + } + + // Single-pointer operations (synchronous) template inline static void exec(T* src, Args... args) { - auto& allocation_map = ResourceManager::getInstance().m_allocations; + auto& rm = ResourceManager::getInstance(); + auto& allocation_map = rm.m_allocations; auto src_record = allocation_map.find(src); auto p = src_record->strategy->getPlatform(); - // get src platform + // Check for operation-specific boundary checks and event recording + if constexpr (std::is_same_v, memset>) { + // For memset, the last argument is the length + std::size_t length = get_last_arg(args...); + + std::ptrdiff_t offset = static_cast(src) - static_cast(src_record->ptr); + std::size_t size = src_record->size - offset; + + if (length > 0 && length > size) { + UMPIRE_ERROR(runtime_error, fmt::format("Cannot memset over the end of allocation: {} -> {}", length, size)); + } + + // Record the event + umpire::event::record([&](auto& event) { + event.name("memset") + .category(event::category::operation) + .arg("ptr", src) + .arg("value", get_arg<1>(args...)) // assumes the value is the first arg + .arg("size", length) + .arg("allocator_ref", (void*)src_record->strategy) + .tag("allocator_name", src_record->strategy->getName()) + .tag("replay", "true"); + }); + } + else if constexpr (std::is_same_v, prefetch>) { + // For prefetch, args are: device, size + int device = get_arg<0>(args...); + std::size_t size = get_last_arg(args...); + + // Record the event + umpire::event::record([&](auto& event) { + event.name("prefetch") + .category(event::category::operation) + .arg("ptr", src) + .arg("device", device) + .arg("size", size) + .arg("allocator_ref", (void*)src_record->strategy) + .tag("allocator_name", src_record->strategy->getName()) + .tag("replay", "true"); + }); + } + + // Dispatch based on platform if (p == camp::resources::Platform::host) { Op::exec(src, args...); } @@ -45,27 +99,147 @@ struct op_caller<1, Op > { } #endif } -}; - -template -struct count { - static constexpr std::size_t value = sizeof...(Ts); -}; + + // Single-pointer operations (asynchronous) + template + inline static camp::resources::EventProxy + exec(T* src, camp::resources::Resource& ctx, Args... args) { + auto& rm = ResourceManager::getInstance(); + auto& allocation_map = rm.m_allocations; + auto src_record = allocation_map.find(src); + auto p = src_record->strategy->getPlatform(); + + // Check for operation-specific boundary checks and event recording + if constexpr (std::is_same_v, memset>) { + // For memset, args are: value, length + std::size_t length = get_last_arg(args...); + + std::ptrdiff_t offset = static_cast(src) - static_cast(src_record->ptr); + std::size_t size = src_record->size - offset; + + if (length > 0 && length > size) { + UMPIRE_ERROR(runtime_error, fmt::format("Cannot memset over the end of allocation: {} -> {}", length, size)); + } + + // Record the event + umpire::event::record([&](auto& event) { + event.name("memset") + .category(event::category::operation) + .arg("ptr", src) + .arg("value", get_arg<0>(args...)) // assumes the value is the first arg + .arg("size", length) + .arg("allocator_ref", (void*)src_record->strategy) + .tag("allocator_name", src_record->strategy->getName()) + .tag("replay", "true") + .tag("async", "true"); + }); + } + else if constexpr (std::is_same_v, prefetch>) { + // For prefetch, args are: device, size + int device = get_arg<0>(args...); + std::size_t size = get_last_arg(args...); + + // Record the event + umpire::event::record([&](auto& event) { + event.name("prefetch") + .category(event::category::operation) + .arg("ptr", src) + .arg("device", device) + .arg("size", size) + .arg("allocator_ref", (void*)src_record->strategy) + .tag("allocator_name", src_record->strategy->getName()) + .tag("replay", "true") + .tag("async", "true"); + }); + } + + // Dispatch based on platform + if (p == camp::resources::Platform::host) { + return Op::exec(src, args..., ctx); + } +#if defined(UMPIRE_ENABLE_CUDA) + else if (p == camp::resources::Platform::cuda) { + return Op::exec(src, args..., ctx); + } +#endif +#if defined(UMPIRE_ENABLE_HIP) + else if (p == camp::resources::Platform::hip) { + return Op::exec(src, args..., ctx); + } +#endif +#if defined(UMPIRE_ENABLE_SYCL) + else if (p == camp::resources::Platform::sycl) { + return Op::exec(src, args..., ctx); + } +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) + else if (p == camp::resources::Platform::omp_target) { + return Op::exec(src, args..., ctx); + } +#endif + // Fallback + UMPIRE_ERROR(runtime_error, + fmt::format("Unknown platform for operation: platform={}", + static_cast(p))); + + // Unreachable, but needed to satisfy compiler + return camp::resources::EventProxy{ctx}; + } -template class Op> -struct op_caller<2, Op> { - // try calling with Op::arity + // Dual-pointer operations (synchronous) template inline static void exec(T* src, T* dst, Args... args) { - auto& allocation_map = ResourceManager::getInstance().m_allocations; + auto& rm = ResourceManager::getInstance(); + auto& allocation_map = rm.m_allocations; auto src_record = allocation_map.find(src); auto dst_record = allocation_map.find(dst); auto p1 = src_record->strategy->getPlatform(); auto p2 = dst_record->strategy->getPlatform(); + + // Check for operation-specific boundary checks and event recording + if constexpr (std::is_same_v, + copy>) { + // For copy, the last argument is the size + std::size_t size = get_last_arg(args...); + + // Calculate source and destination details + std::ptrdiff_t src_offset = static_cast(src) - static_cast(src_record->ptr); + std::size_t src_size = src_record->size - src_offset; + + std::ptrdiff_t dst_offset = static_cast(dst) - static_cast(dst_record->ptr); + std::size_t dst_size = dst_record->size - dst_offset; + + // If size is 0, use the source size + if (size == 0) { + size = src_size; + } + + // Check if destination has enough space + if (size > dst_size) { + UMPIRE_ERROR(runtime_error, + fmt::format("Not enough space in destination to copy {} bytes into {} bytes", size, dst_size)); + } + + // Record the event + umpire::event::record([&](auto& event) { + event.name("copy") + .category(event::category::operation) + .arg("src", src) + .arg("dst", dst) + .arg("src_offset", src_offset) + .arg("dst_offset", dst_offset) + .arg("size", size) + .arg("src_allocator_ref", (void*)src_record->strategy) + .arg("dst_allocator_ref", (void*)dst_record->strategy) + .tag("src_allocator_name", src_record->strategy->getName()) + .tag("dst_allocator_name", dst_record->strategy->getName()) + .tag("replay", "true"); + }); + } - // get src and dest platform + // Dispatch based on source and destination platforms if ((p1 == p2) && (p1 == camp::resources::Platform::host)) { return Op::exec(src, dst, args...); } @@ -106,128 +280,192 @@ struct op_caller<2, Op> { } #endif } -}; - -} - -// template -// void copy(T* src, T* dst, std::size_t len) { -// op::copy::exec(src, dst, len); -// } - -template -void copy(T* src, T* dst, std::size_t len) { - op::op_caller<2, op::copy>::exec(src, dst, len); -} - -template -camp::resources::EventProxy copy(T* src, T* dst, camp::resources::Resource& ctx, std::size_t len) { - auto& allocation_map = ResourceManager::getInstance().m_allocations; + + // Dual-pointer operations (asynchronous) + template + inline static camp::resources::EventProxy + exec(T* src, T* dst, camp::resources::Resource& ctx, Args... args) { + auto& rm = ResourceManager::getInstance(); + auto& allocation_map = rm.m_allocations; auto src_record = allocation_map.find(src); auto dst_record = allocation_map.find(dst); auto p1 = src_record->strategy->getPlatform(); auto p2 = dst_record->strategy->getPlatform(); - // get src and dest platform + // Check for operation-specific boundary checks and event recording + if constexpr (std::is_same_v, + copy>) { + // For copy, the last argument is the size + std::size_t size = get_last_arg(args...); + + // Calculate source and destination details + std::ptrdiff_t src_offset = static_cast(src) - static_cast(src_record->ptr); + std::size_t src_size = src_record->size - src_offset; + + std::ptrdiff_t dst_offset = static_cast(dst) - static_cast(dst_record->ptr); + std::size_t dst_size = dst_record->size - dst_offset; + + // If size is 0, use the source size + if (size == 0) { + size = src_size; + } + + // Check if destination has enough space + if (size > dst_size) { + UMPIRE_ERROR(runtime_error, + fmt::format("Not enough resource in destination for copy: {} -> {}", size, dst_size)); + } + + // Record the event + umpire::event::record([&](auto& event) { + event.name("copy") + .category(event::category::operation) + .arg("src", src) + .arg("dst", dst) + .arg("src_offset", src_offset) + .arg("dst_offset", dst_offset) + .arg("size", size) + .arg("src_allocator_ref", (void*)src_record->strategy) + .arg("dst_allocator_ref", (void*)dst_record->strategy) + .tag("src_allocator_name", src_record->strategy->getName()) + .tag("dst_allocator_name", dst_record->strategy->getName()) + .tag("replay", "true") + .tag("async", "true"); + }); + } + + // Dispatch based on source and destination platforms if ((p1 == p2) && (p1 == camp::resources::Platform::host)) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } #if defined(UMPIRE_ENABLE_CUDA) if (p1 == p2 && (p1 == camp::resources::Platform::cuda)) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::cuda) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } else if (p1 == camp::resources::Platform::cuda && p2 == camp::resources::Platform::host) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } #endif #if defined(UMPIRE_ENABLE_HIP) if (p1 == p2 && (p1 == camp::resources::Platform::hip)) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::hip) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } else if (p1 == camp::resources::Platform::hip && p2 == camp::resources::Platform::host) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } #endif #if defined(UMPIRE_ENABLE_SYCL) if (p1 == p2 && (p1 == camp::resources::Platform::sycl)) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::sycl) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } else if (p1 == camp::resources::Platform::sycl && p2 == camp::resources::Platform::host) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } #endif #if defined(UMPIRE_ENABLE_OPENMP_TARGET) if (p1 == p2 && (p1 == camp::resources::Platform::omp_target)) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } else if (p1 == camp::resources::Platform::host && p2 == camp::resources::Platform::omp_target) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } else if (p1 == camp::resources::Platform::omp_target && p2 == camp::resources::Platform::host) { - return op::copy::exec(src, dst, len, ctx); + return Op::exec(src, dst, args..., ctx); } #endif - + + // Fallback UMPIRE_ERROR(runtime_error, - fmt::format("Unknown platforms for copy: src={}, dst={}", + fmt::format("Unknown platforms for operation: src_platform={}, dst_platform={}", static_cast(p1), static_cast(p2))); // Unreachable, but needed to satisfy compiler return camp::resources::EventProxy{ctx}; + } +}; + +} + +template +void copy(T* src, T* dst, std::size_t len) { + op::op_caller::exec(src, dst, len); +} + +template +camp::resources::EventProxy copy(T* src, T* dst, camp::resources::Resource& ctx, std::size_t len) { + return op::op_caller::exec(src, dst, ctx, len); } template void memset(T* src, V v, std::size_t len) { - op::op_caller<1, op::memset>::exec(src, v, len); + op::op_caller::exec(src, v, len); } template camp::resources::EventProxy memset(T* src, int v, camp::resources::Resource& ctx, std::size_t len) { + return op::op_caller::exec(src, ctx, v, len); +} + +template +T* reallocate(T* src, std::size_t size) { + // We'll handle the void* case as a specialization + + if (src == nullptr) { + // If src is nullptr, just allocate memory from the default allocator + auto& rm = ResourceManager::getInstance(); + Allocator allocator = rm.getDefaultAllocator(); + return static_cast(allocator.allocate(size * sizeof(T))); + } + + // Otherwise, use the platform-specific implementation if available, + // falling back to the generic implementation auto& allocation_map = ResourceManager::getInstance().m_allocations; auto src_record = allocation_map.find(src); auto p = src_record->strategy->getPlatform(); + T* new_ptr = nullptr; + if (p == camp::resources::Platform::host) { - return op::memset::exec(src, v, len, ctx); + new_ptr = op::generic_reallocate::exec(src, size); } #if defined(UMPIRE_ENABLE_CUDA) else if (p == camp::resources::Platform::cuda) { - return op::memset::exec(src, v, len, ctx); + new_ptr = op::generic_reallocate::exec(src, size); } #endif #if defined(UMPIRE_ENABLE_HIP) else if (p == camp::resources::Platform::hip) { - return op::memset::exec(src, v, len, ctx); + new_ptr = op::generic_reallocate::exec(src, size); } #endif #if defined(UMPIRE_ENABLE_SYCL) else if (p == camp::resources::Platform::sycl) { - return op::memset::exec(src, v, len, ctx); + new_ptr = op::generic_reallocate::exec(src, size); } #endif #if defined(UMPIRE_ENABLE_OPENMP_TARGET) else if (p == camp::resources::Platform::omp_target) { - return op::memset::exec(src, v, len, ctx); + new_ptr = op::generic_reallocate::exec(src, size); } #endif + else { + // Fallback to generic implementation + new_ptr = op::generic_reallocate::exec(src, size); + } - UMPIRE_ERROR(runtime_error, - fmt::format("Unknown platform for memset: platform={}", - static_cast(p))); - - // Unreachable, but needed to satisfy compiler - return camp::resources::EventProxy{ctx}; + return new_ptr; } -template -T* reallocate(T* src, std::size_t size) { +// Explicit specialization for void* +template<> +void* reallocate(void* src, std::size_t size) { if (src == nullptr) { // If src is nullptr, just allocate memory from the default allocator auto& rm = ResourceManager::getInstance(); Allocator allocator = rm.getDefaultAllocator(); - return static_cast(allocator.allocate(size * sizeof(T))); + return allocator.allocate(size); } // Otherwise, use the platform-specific implementation if available, @@ -236,36 +474,42 @@ T* reallocate(T* src, std::size_t size) { auto src_record = allocation_map.find(src); auto p = src_record->strategy->getPlatform(); + void* new_ptr = nullptr; + if (p == camp::resources::Platform::host) { - return op::generic_reallocate::exec(src, size); + new_ptr = op::generic_reallocate::exec(src, size); } #if defined(UMPIRE_ENABLE_CUDA) else if (p == camp::resources::Platform::cuda) { - return op::generic_reallocate::exec(src, size); + new_ptr = op::generic_reallocate::exec(src, size); } #endif #if defined(UMPIRE_ENABLE_HIP) else if (p == camp::resources::Platform::hip) { - return op::generic_reallocate::exec(src, size); + new_ptr = op::generic_reallocate::exec(src, size); } #endif #if defined(UMPIRE_ENABLE_SYCL) else if (p == camp::resources::Platform::sycl) { - return op::generic_reallocate::exec(src, size); + new_ptr = op::generic_reallocate::exec(src, size); } #endif #if defined(UMPIRE_ENABLE_OPENMP_TARGET) else if (p == camp::resources::Platform::omp_target) { - return op::generic_reallocate::exec(src, size); + new_ptr = op::generic_reallocate::exec(src, size); } #endif + else { + // Fallback to generic implementation + new_ptr = op::generic_reallocate::exec(src, size); + } - // Fallback to generic implementation - return op::generic_reallocate::exec(src, size); + return new_ptr; } template camp::resources::EventProxy reallocate(T* src, std::size_t size, camp::resources::Resource& ctx) { + if (src == nullptr) { // If src is nullptr, just allocate memory from the default allocator auto& rm = ResourceManager::getInstance(); @@ -308,30 +552,54 @@ camp::resources::EventProxy reallocate(T* src, std::s return op::generic_reallocate::exec(src, size, ctx); } -template -camp::resources::EventProxy prefetch(T* ptr, int device, camp::resources::Resource& ctx, std::size_t size) { +// Explicit specialization for void* +template<> +camp::resources::EventProxy reallocate(void* src, std::size_t size, camp::resources::Resource& ctx) { + if (src == nullptr) { + // If src is nullptr, just allocate memory from the default allocator + auto& rm = ResourceManager::getInstance(); + Allocator allocator = rm.getDefaultAllocator(); + allocator.allocate(size); + return camp::resources::EventProxy{ctx}; + } + + // Otherwise, use the platform-specific implementation if available, + // falling back to the generic implementation auto& allocation_map = ResourceManager::getInstance().m_allocations; - auto ptr_record = allocation_map.find(ptr); - auto p = ptr_record->strategy->getPlatform(); + auto src_record = allocation_map.find(src); + auto p = src_record->strategy->getPlatform(); - // Currently only CUDA and HIP platforms support prefetch + if (p == camp::resources::Platform::host) { + return op::generic_reallocate::exec(src, size, ctx); + } #if defined(UMPIRE_ENABLE_CUDA) - if (p == camp::resources::Platform::cuda) { - return op::prefetch::exec(ptr, device, size, ctx); + else if (p == camp::resources::Platform::cuda) { + return op::generic_reallocate::exec(src, size, ctx); } #endif #if defined(UMPIRE_ENABLE_HIP) - if (p == camp::resources::Platform::hip) { - return op::prefetch::exec(ptr, device, size, ctx); + else if (p == camp::resources::Platform::hip) { + return op::generic_reallocate::exec(src, size, ctx); + } +#endif +#if defined(UMPIRE_ENABLE_SYCL) + else if (p == camp::resources::Platform::sycl) { + return op::generic_reallocate::exec(src, size, ctx); + } +#endif +#if defined(UMPIRE_ENABLE_OPENMP_TARGET) + else if (p == camp::resources::Platform::omp_target) { + return op::generic_reallocate::exec(src, size, ctx); } #endif - UMPIRE_ERROR(runtime_error, - fmt::format("Prefetch not supported for platform: {}", - static_cast(p))); - - // Unreachable, but needed to satisfy compiler - return camp::resources::EventProxy{ctx}; + // Fallback to generic implementation + return op::generic_reallocate::exec(src, size, ctx); +} + +template +camp::resources::EventProxy prefetch(T* ptr, int device, camp::resources::Resource& ctx, std::size_t size) { + return op::op_caller::exec(ptr, ctx, device, size); } } \ No newline at end of file diff --git a/include/umpire/op/host.hpp b/include/umpire/op/host.hpp index 12169ba22..8daa016cc 100644 --- a/include/umpire/op/host.hpp +++ b/include/umpire/op/host.hpp @@ -1,108 +1,122 @@ #pragma once +#include + #include "umpire/resource/platform.hpp" #include "umpire/util/error.hpp" -#include - namespace umpire { namespace op { namespace { - // Generic implementation of host copy - template - inline void copy_impl(T* src, T* dst, std::size_t len) { - std::memcpy(dst, src, len * sizeof(T)); - } +// Generic implementation of host copy +template +inline void copy_impl(T* src, T* dst, std::size_t len) +{ + std::memcpy(dst, src, len * sizeof(T)); +} - // Specialization for void* - template<> - inline void copy_impl(void* src, void* dst, std::size_t len) { - std::memcpy(dst, src, len); - } +// Specialization for void* +template <> +inline void copy_impl(void* src, void* dst, std::size_t len) +{ + std::memcpy(dst, src, len); +} - // Generic implementation of host memset - template - inline void memset_impl(T* src, int val, std::size_t len) { - std::memset(src, val, sizeof(T) * len); - } +// Generic implementation of host memset +template +inline void memset_impl(T* src, int val, std::size_t len) +{ + std::memset(src, val, sizeof(T) * len); +} - // Specialization for void* - template<> - inline void memset_impl(void* src, int val, std::size_t len) { - std::memset(src, val, len); - } +// Specialization for void* +template <> +inline void memset_impl(void* src, int val, std::size_t len) +{ + std::memset(src, val, len); } +} // namespace // Host-to-host copy operation -template<> -struct copy -{ - template - static void exec(T* src, T* dst, std::size_t len) { +template <> +struct copy { + template + static void exec(T* src, T* dst, std::size_t len) + { copy_impl(src, dst, len); } - + // Async version returns a dummy event - template - static camp::resources::EventProxy exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { + template + static camp::resources::EventProxy exec(T* src, T* dst, std::size_t len, + camp::resources::Resource& r) + { copy_impl(src, dst, len); return camp::resources::EventProxy{r}; } - + // Specialization for void* - template<> - static void exec(void* src, void* dst, std::size_t len) { + template <> + void exec(void* src, void* dst, std::size_t len) + { copy_impl(src, dst, len); } - - template<> - static camp::resources::EventProxy exec(void* src, void* dst, std::size_t len, camp::resources::Resource& r) { + + template <> + camp::resources::EventProxy exec(void* src, void* dst, std::size_t len, + camp::resources::Resource& r) + { copy_impl(src, dst, len); return camp::resources::EventProxy{r}; } }; // Host memset operation -template<> -struct memset -{ - template - static void exec(T* src, int val, std::size_t len) { +template <> +struct memset { + template + static void exec(T* src, int val, std::size_t len) + { memset_impl(src, val, len); } - + // Async version returns a dummy event - template - static camp::resources::EventProxy exec(T* src, int val, std::size_t len, camp::resources::Resource& r) { + template + static camp::resources::EventProxy exec(T* src, int val, std::size_t len, + camp::resources::Resource& r) + { memset_impl(src, val, len); return camp::resources::EventProxy{r}; } - + // Specialization for void* - template<> - static void exec(void* src, int val, std::size_t len) { + template <> + void exec(void* src, int val, std::size_t len) + { memset_impl(src, val, len); } - - template<> - static camp::resources::EventProxy exec(void* src, int val, std::size_t len, camp::resources::Resource& r) { + + template <> + camp::resources::EventProxy exec(void* src, int val, std::size_t len, + camp::resources::Resource& r) + { memset_impl(src, val, len); return camp::resources::EventProxy{r}; } }; // Host reallocate operation - uses system realloc -template<> -struct reallocate -{ - template - static T* exec(T* src, std::size_t size) { +template <> +struct reallocate { + template + static T* exec(T* src, std::size_t size) + { if (!src) { // Return nullptr for nullptr input return nullptr; } - + if (size == 0) { if (src) { // Free memory for zero-sized allocation @@ -110,44 +124,75 @@ struct reallocate } return nullptr; } - + // Use standard realloc for host memory T* ret = static_cast(std::realloc(src, size * sizeof(T))); - + if (!ret && size > 0) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format("Host realloc failed for pointer={}, size={}", - src, size * sizeof(T))); + UMPIRE_ERROR(runtime_error, fmt::format("Host realloc failed for pointer={}, size={}", src, size * sizeof(T))); } - + return ret; } - + // Specialization for void* to handle size correctly - template<> - static void* exec(void* src, std::size_t size) { + template <> + void* exec(void* src, std::size_t size) + { if (!src) { return nullptr; } - + if (size == 0) { if (src) { std::free(src); } return nullptr; } - + void* ret = std::realloc(src, size); - + if (!ret && size > 0) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format("Host realloc failed for pointer={}, size={}", - src, size)); + UMPIRE_ERROR(runtime_error, fmt::format("Host realloc failed for pointer={}, size={}", src, size)); } - + return ret; } }; -} -} \ No newline at end of file +// Host prefetch operation - no-op for host memory +template <> +struct prefetch { + template + static void exec(T* src, int device, std::size_t len) + { + // No-op for host memory + } + + // Async version returns a dummy event + template + static camp::resources::EventProxy exec(T* src, int device, std::size_t len, + camp::resources::Resource& r) + { + // No-op for host memory + return camp::resources::EventProxy{r}; + } + + // Specialization for void* + template <> + void exec(void* src, int device, std::size_t len) + { + // No-op for host memory + } + + template <> + camp::resources::EventProxy exec(void* src, int device, std::size_t len, + camp::resources::Resource& r) + { + // No-op for host memory + return camp::resources::EventProxy{r}; + } +}; + +} // namespace op +} // namespace umpire diff --git a/include/umpire/op/operations.hpp b/include/umpire/op/operations.hpp index ea52ccbba..20c6650a8 100644 --- a/include/umpire/op/operations.hpp +++ b/include/umpire/op/operations.hpp @@ -12,21 +12,25 @@ namespace op { struct operation { static constexpr int arity = -1; + static constexpr const char* name = "UNKNOWN"; }; template struct copy : public operation { static constexpr int arity = 2; + static constexpr const char* name = "COPY"; }; template struct memset : public operation { static constexpr int arity = 1; + static constexpr const char* name = "MEMSET"; }; template struct reallocate : public operation { static constexpr int arity = 1; + static constexpr const char* name = "REALLOCATE"; }; // Generic reallocate implementation that works for any platform @@ -34,6 +38,7 @@ struct reallocate : public operation { template struct generic_reallocate : public operation { static constexpr int arity = 1; + static constexpr const char* name = "REALLOCATE"; template static T* exec(T* current_ptr, std::size_t new_size) { @@ -105,9 +110,15 @@ struct generic_reallocate : public operation { // Copy data from old to new location asynchronously auto event = rm.copy(new_ptr, current_ptr, ctx, copy_size); - // Deallocate old memory - // Note: This is problematic as we're deallocating before the copy completes - // In practice, we would need to chain operations or use a callback + // IMPORTANT: In a fully async implementation, we would need to chain the deallocation + // to happen after the copy completes. However, since we don't have that mechanism yet, + // and ResourceManager's reallocate operation doesn't wait on the event, we need to + // deallocate here as we did in the synchronous case. + // + // This has the potential to cause race conditions if the memory is deallocated before + // the copy completes, but for most allocators, the memory won't be immediately reused. + // A better solution would be to have the ResourceManager wait on the event before returning + // or implement a chained operation system. allocator.deallocate(current_ptr); return event; @@ -128,53 +139,63 @@ struct generic_reallocate : public operation { template struct advise : public operation { static constexpr int arity = 1; + static constexpr const char* name = "ADVISE"; }; template struct accessed_by : public operation { static constexpr int arity = 1; + static constexpr const char* name = "SET_ACCESSED_BY"; }; template struct preferred_location : public operation { static constexpr int arity = 1; + static constexpr const char* name = "SET_PREFERRED_LOCATION"; }; template struct read_mostly : public operation { static constexpr int arity = 1; + static constexpr const char* name = "SET_READ_MOSTLY"; }; template struct unset_accessed_by : public operation { static constexpr int arity = 1; + static constexpr const char* name = "UNSET_ACCESSED_BY"; }; template struct unset_preferred_location : public operation { static constexpr int arity = 1; + static constexpr const char* name = "UNSET_PREFERRED_LOCATION"; }; template struct unset_read_mostly : public operation { static constexpr int arity = 1; + static constexpr const char* name = "UNSET_READ_MOSTLY"; }; #if (defined(UMPIRE_ENABLE_HIP) && HIP_VERSION_MAJOR >= 5) || defined(UMPIRE_ENABLE_CUDA) template struct coarse_grain : public operation { static constexpr int arity = 1; + static constexpr const char* name = "SET_COARSE_GRAIN"; }; template struct unset_coarse_grain : public operation { static constexpr int arity = 1; + static constexpr const char* name = "UNSET_COARSE_GRAIN"; }; #endif template struct prefetch : public operation { static constexpr int arity = 1; + static constexpr const char* name = "PREFETCH"; }; } diff --git a/src/umpire/ResourceManager.cpp b/src/umpire/ResourceManager.cpp index e66be7543..8e647a40e 100644 --- a/src/umpire/ResourceManager.cpp +++ b/src/umpire/ResourceManager.cpp @@ -1,4 +1,3 @@ -////////////////////////////////////////////////////////////////////////////// // Copyright (c) 2016-25, Lawrence Livermore National Security, LLC and Umpire // project contributors. See the COPYRIGHT file for details. // @@ -392,39 +391,7 @@ void ResourceManager::copy(void* dst_ptr, void* src_ptr, std::size_t size) { UMPIRE_LOG(Debug, "(src_ptr=" << src_ptr << ", dst_ptr=" << dst_ptr << ", size=" << size << ")"); - auto src_alloc_record = m_allocations.find(src_ptr); - std::ptrdiff_t src_offset = static_cast(src_ptr) - static_cast(src_alloc_record->ptr); - std::size_t src_size = src_alloc_record->size - src_offset; - - auto dst_alloc_record = m_allocations.find(dst_ptr); - std::ptrdiff_t dst_offset = static_cast(dst_ptr) - static_cast(dst_alloc_record->ptr); - std::size_t dst_size = dst_alloc_record->size - dst_offset; - - if (size == 0) { - size = src_size; - } - - umpire::event::record([&](auto& event) { - event.name("copy") - .category(event::category::operation) - .arg("src", src_ptr) - .arg("dst", dst_ptr) - .arg("src_offset", src_offset) - .arg("dst_offset", dst_offset) - .arg("size", size) - .arg("src_allocator_ref", (void*)src_alloc_record->strategy) - .arg("dst_allocator_ref", (void*)dst_alloc_record->strategy) - .tag("src_allocator_name", src_alloc_record->strategy->getName()) - .tag("dst_allocator_name", dst_alloc_record->strategy->getName()) - .tag("replay", "true"); - }); - - if (size > dst_size) { - UMPIRE_ERROR(runtime_error, - fmt::format("Not enough space in destination to copy {} bytes into {} bytes", size, dst_size)); - } - - // Use the template-based copy operation + // Use the template-based copy operation which will perform the checks and logging internally umpire::copy(static_cast(src_ptr), static_cast(dst_ptr), size); } @@ -434,76 +401,15 @@ camp::resources::EventProxy ResourceManager::copy(voi { UMPIRE_LOG(Debug, "(src_ptr=" << src_ptr << ", dst_ptr=" << dst_ptr << ", size=" << size << ")"); - auto src_alloc_record = m_allocations.find(src_ptr); - std::ptrdiff_t src_offset = static_cast(src_ptr) - static_cast(src_alloc_record->ptr); - std::size_t src_size = src_alloc_record->size - src_offset; - - auto dst_alloc_record = m_allocations.find(dst_ptr); - std::ptrdiff_t dst_offset = static_cast(dst_ptr) - static_cast(dst_alloc_record->ptr); - std::size_t dst_size = dst_alloc_record->size - dst_offset; - - if (size == 0) { - size = src_size; - } - - umpire::event::record([&](auto& event) { - event.name("copy") - .category(event::category::operation) - .arg("src", src_ptr) - .arg("dst", dst_ptr) - .arg("src_offset", src_offset) - .arg("dst_offset", dst_offset) - .arg("size", size) - .arg("src_allocator_ref", (void*)src_alloc_record->strategy) - .arg("dst_allocator_ref", (void*)dst_alloc_record->strategy) - .tag("src_allocator_name", src_alloc_record->strategy->getName()) - .tag("dst_allocator_name", dst_alloc_record->strategy->getName()) - .tag("replay", "true") - .tag("async", "true"); - }); - - if (size > dst_size) { - UMPIRE_ERROR(runtime_error, fmt::format("Not enough resource in destination for copy: {} -> {}", size, dst_size)); - } - - // Use the template-based async copy operation directly + // Use the template-based async copy operation which will perform the checks and logging internally return umpire::copy(static_cast(src_ptr), static_cast(dst_ptr), ctx, size); - - // If there are issues with the template-based implementation, fall back to the class-based one: - // auto& op_registry = op::MemoryOperationRegistry::getInstance(); - // auto op = op_registry.find("COPY", src_alloc_record->strategy, dst_alloc_record->strategy); - // return op->transform_async(src_ptr, &dst_ptr, src_alloc_record, dst_alloc_record, size, ctx); } void ResourceManager::memset(void* ptr, int value, std::size_t length) { UMPIRE_LOG(Debug, "(ptr=" << ptr << ", value=" << value << ", length=" << length << ")"); - auto alloc_record = m_allocations.find(ptr); - - std::ptrdiff_t offset = static_cast(ptr) - static_cast(alloc_record->ptr); - std::size_t size = alloc_record->size - offset; - - if (length == 0) { - length = size; - } - - umpire::event::record([&](auto& event) { - event.name("memset") - .category(event::category::operation) - .arg("ptr", ptr) - .arg("value", value) - .arg("size", size) - .arg("allocator_ref", (void*)alloc_record->strategy) - .tag("allocator_name", alloc_record->strategy->getName()) - .tag("replay", "true"); - }); - - if (length > size) { - UMPIRE_ERROR(runtime_error, fmt::format("Cannot memset over the end of allocation: {} -> {}", length, size)); - } - - // Use the template-based memset operation + // Use the template-based memset operation which will perform the checks and logging internally umpire::memset(static_cast(ptr), value, length); } @@ -513,38 +419,8 @@ camp::resources::EventProxy ResourceManager::memset(v { UMPIRE_LOG(Debug, "(ptr=" << ptr << ", value=" << value << ", length=" << length << ")"); - auto alloc_record = m_allocations.find(ptr); - - std::ptrdiff_t offset = static_cast(ptr) - static_cast(alloc_record->ptr); - std::size_t size = alloc_record->size - offset; - - if (length == 0) { - length = size; - } - - umpire::event::record([&](auto& event) { - event.name("memset") - .category(event::category::operation) - .arg("ptr", ptr) - .arg("value", value) - .arg("size", size) - .arg("allocator_ref", (void*)alloc_record->strategy) - .tag("allocator_name", alloc_record->strategy->getName()) - .tag("replay", "true") - .tag("async", "true"); - }); - - if (length > size) { - UMPIRE_ERROR(runtime_error, fmt::format("Cannot memset over the end of allocation: {} -> {}", length, size)); - } - - // Use the template-based async memset operation directly + // Use the template-based async memset operation which will perform the checks and logging internally return umpire::memset(static_cast(ptr), value, ctx, length); - - // If there are issues with the template-based implementation, fall back to the class-based one: - // auto& op_registry = op::MemoryOperationRegistry::getInstance(); - // auto op = op_registry.find("MEMSET", alloc_record->strategy, alloc_record->strategy); - // return op->apply_async(ptr, alloc_record, value, length, ctx); } void* ResourceManager::reallocate(void* current_ptr, std::size_t new_size) @@ -702,21 +578,9 @@ void* ResourceManager::reallocate_impl(void* current_ptr, std::size_t new_size, fmt::format("Cannot reallocate an offset ptr (ptr={}, base={})", current_ptr, alloc_record->ptr)); } - // During transition we need to use the MemoryOperationRegistry for the actual reallocation - // since our template-based implementation doesn't have access to the allocation records - auto& op_registry = op::MemoryOperationRegistry::getInstance(); - std::shared_ptr op; - - if (alloc_record->strategy->getPlatform() == Platform::host && - getAllocator("HOST").getId() != alloc_record->strategy->getId()) { - op = op_registry.find("REALLOCATE", std::make_pair(Platform::undefined, Platform::undefined)); - } else { - op = op_registry.find("REALLOCATE", alloc_record->strategy, alloc_record->strategy); - } - - op->transform(current_ptr, &new_ptr, alloc_record, alloc_record, new_size); - // Use the template-based reallocate operation directly + // This will find the allocator for current_ptr, allocate new memory, + // copy the data, and deallocate the old memory new_ptr = umpire::reallocate(static_cast(current_ptr), new_size); } } @@ -759,59 +623,43 @@ void* ResourceManager::reallocate_impl(void* current_ptr, std::size_t new_size, fmt::format("Cannot reallocate an offset ptr (ptr={}, base={})", current_ptr, alloc_record->ptr)); } - // During transition we need to use the MemoryOperationRegistry for the actual reallocation - // since our template-based implementation doesn't have access to the allocation records - auto& op_registry = op::MemoryOperationRegistry::getInstance(); - std::shared_ptr op; - - if (alloc_record->strategy->getPlatform() == Platform::host && - getAllocator("HOST").getId() != alloc_record->strategy->getId()) { - op = op_registry.find("REALLOCATE", std::make_pair(Platform::undefined, Platform::undefined)); - op->transform(current_ptr, &new_ptr, alloc_record, alloc_record, new_size); - } else { - op = op_registry.find("REALLOCATE", alloc_record->strategy, alloc_record->strategy); - // Use async transform for async reallocate - auto event = op->transform_async(current_ptr, &new_ptr, alloc_record, alloc_record, new_size, ctx); - // We might need to wait for the event to complete here since reallocate - // needs to return a pointer that is immediately usable - event.wait(); - } - // Use the template-based reallocate operation with async support - auto event = umpire::reallocate(static_cast(current_ptr), new_size, ctx); - // Wait for the operation to complete since we need the pointer right away - event.wait(); + // Even though we're using the async version, the implementation actually + // does the allocation and deallocation right away - it's just the copy that's async + new_ptr = allocator.allocate(new_size); + + // Calculate copy size (minimum of old and new size) + std::size_t old_size = getSize(current_ptr); + std::size_t copy_size = (old_size > new_size) ? new_size : old_size; - // Note: The current implementation reallocates and deallocates internally, - // but doesn't return the new pointer through the event. This is a limitation - // that would need to be addressed in a future implementation. + // Copy data asynchronously - we already have the new pointer + auto event = copy(new_ptr, current_ptr, ctx, copy_size); - // For now, we'll continue using the MemoryOperationRegistry implementation - // but in the future, we would need a mechanism to retrieve the new pointer - // from the async operation. + // Deallocate the old pointer + allocator.deallocate(current_ptr); } } return new_ptr; } -void* ResourceManager::move(void* ptr, Allocator allocator) +void* ResourceManager::move(void* src_ptr, Allocator allocator) { - UMPIRE_LOG(Debug, "(src_ptr=" << ptr << ", allocator=" << allocator.getName() << ")"); + UMPIRE_LOG(Debug, "(src_ptr=" << src_ptr << ", allocator=" << allocator.getName() << ")"); - auto alloc_record = m_allocations.find(ptr); + auto alloc_record = m_allocations.find(src_ptr); // short-circuit if ptr was allocated by 'allocator' if (alloc_record->strategy == allocator.getAllocationStrategy()) { umpire::event::record([&](auto& event) { event.name("move") .category(event::category::operation) - .arg("ptr", ptr) + .arg("ptr", src_ptr) .arg("allocator_ref", (void*)allocator.getAllocationStrategy()) .tag("allocator_name", allocator.getName()) .tag("replay", "true"); }); - return ptr; + return src_ptr; } #if defined(UMPIRE_ENABLE_NUMA) @@ -823,7 +671,7 @@ void* ResourceManager::move(void* ptr, Allocator allocator) if (dynamic_cast(base_strategy)) { auto& op_registry = op::MemoryOperationRegistry::getInstance(); - auto src_alloc_record = m_allocations.find(ptr); + auto src_alloc_record = m_allocations.find(src_ptr); const std::size_t size{src_alloc_record->size}; util::AllocationRecord dst_alloc_record{nullptr, size, allocator.getAllocationStrategy()}; @@ -831,37 +679,37 @@ void* ResourceManager::move(void* ptr, Allocator allocator) if (size > 0) { auto op = op_registry.find("MOVE", src_alloc_record->strategy, dst_alloc_record.strategy); void* ret{nullptr}; - op->transform(ptr, &ret, src_alloc_record, &dst_alloc_record, size); - UMPIRE_ASSERT(ret == ptr); + op->transform(src_ptr, &ret, src_alloc_record, &dst_alloc_record, size); + UMPIRE_ASSERT(ret == src_ptr); } umpire::event::record([&](auto& event) { event.name("move") .category(event::category::operation) - .arg("ptr", ptr) + .arg("ptr", src_ptr) .arg("allocator_ref", (void*)allocator.getAllocationStrategy()) .tag("allocator_name", allocator.getName()) .tag("replay", "true") - .arg("result", ptr); + .arg("result", src_ptr); }); - return ptr; + return src_ptr; } } #endif - if (ptr != alloc_record->ptr) { - UMPIRE_ERROR(runtime_error, fmt::format("Cannot move an offset ptr (ptr={}, base={})", ptr, alloc_record->ptr)); + if (src_ptr != alloc_record->ptr) { + UMPIRE_ERROR(runtime_error, fmt::format("Cannot move an offset ptr (ptr={}, base={})", src_ptr, alloc_record->ptr)); } void* dst_ptr{allocator.allocate(alloc_record->size)}; - copy(dst_ptr, ptr); + copy(dst_ptr, src_ptr); - deallocate(ptr); + deallocate(src_ptr); umpire::event::record([&](auto& event) { event.name("move") .category(event::category::operation) - .arg("ptr", ptr) + .arg("ptr", src_ptr) .arg("allocator_ref", (void*)allocator.getAllocationStrategy()) .tag("allocator_name", allocator.getName()) .tag("replay", "true") @@ -885,30 +733,8 @@ camp::resources::EventProxy ResourceManager::prefetch std::ptrdiff_t offset = static_cast(ptr) - static_cast(alloc_record->ptr); std::size_t size = alloc_record->size - offset; - auto platform = alloc_record->strategy->getPlatform(); - - // We need to add a template-based prefetch operation in include/umpire/op/dispatch.hpp - // For now, we'll continue to use the class-based implementation - auto& op_registry = op::MemoryOperationRegistry::getInstance(); - auto op = op_registry.find("PREFETCH", alloc_record->strategy, alloc_record->strategy); - return op->apply_async(ptr, alloc_record, device, size, ctx); - - // In the future, we would have something like: - /* - if (platform == Platform::cuda) { - return umpire::prefetch(static_cast(ptr), device, ctx, size); - } else if (platform == Platform::hip) { - return umpire::prefetch(static_cast(ptr), device, ctx, size); - } else if (platform == Platform::sycl) { - return umpire::prefetch(static_cast(ptr), device, ctx, size); - } else { - UMPIRE_ERROR(runtime_error, - fmt::format("Prefetch not supported for platform: {}", - static_cast(platform))); - // Unreachable, but needed to satisfy compiler - return camp::resources::EventProxy{ctx}; - } - */ + // Use the template-based prefetch operation which will perform the checks and logging internally + return umpire::prefetch(static_cast(ptr), device, ctx, size); } void ResourceManager::deallocate(void* ptr) @@ -1025,4 +851,4 @@ int ResourceManager::getNumDevices() const return device_count; } -} // end of namespace umpire +} // end of namespace umpire \ No newline at end of file diff --git a/src/umpire/ResourceManager.hpp b/src/umpire/ResourceManager.hpp index ae37bb7ce..22c0f4131 100644 --- a/src/umpire/ResourceManager.hpp +++ b/src/umpire/ResourceManager.hpp @@ -26,7 +26,7 @@ namespace umpire { namespace op { class MemoryOperation; -template class Op> struct op_caller; +template class Op> struct op_caller; } @@ -42,6 +42,9 @@ class AllocateNull; * \brief */ class ResourceManager { + // Friend declarations for template operations + template class Op> + friend struct op::op_caller; public: /*! * \brief @@ -339,7 +342,10 @@ class ResourceManager { void* reallocate_impl(void* current_ptr, std::size_t new_size, Allocator allocator, camp::resources::Resource& ctx); + public: util::AllocationMap m_allocations; + + private: std::list> m_allocators; std::vector m_shared_allocator_names; @@ -363,7 +369,7 @@ class ResourceManager { friend strategy::ZeroByteHandler; friend strategy::mixins::AllocateNull; - template class Op> + template class Op> friend struct umpire::op::op_caller; }; From 244940ee658b67453d41d599274841c9910b5c05 Mon Sep 17 00:00:00 2001 From: David Beckingsale Date: Tue, 1 Apr 2025 13:43:17 -0700 Subject: [PATCH 06/26] Checkpoint --- include/umpire/op/cuda.hpp | 541 +++++------------ include/umpire/op/detail/utils.hpp | 18 + include/umpire/op/dispatch.hpp | 901 +++++++++++++--------------- include/umpire/op/hip.hpp | 585 ++++++------------ include/umpire/op/host.hpp | 107 +--- include/umpire/op/openmp_target.hpp | 180 +++--- include/umpire/op/operations.hpp | 224 +++++-- include/umpire/op/sycl.hpp | 139 ++--- 8 files changed, 1072 insertions(+), 1623 deletions(-) create mode 100644 include/umpire/op/detail/utils.hpp diff --git a/include/umpire/op/cuda.hpp b/include/umpire/op/cuda.hpp index 8c5f5d6f4..28136a19f 100644 --- a/include/umpire/op/cuda.hpp +++ b/include/umpire/op/cuda.hpp @@ -1,469 +1,196 @@ #pragma once +#include "umpire/op/detail/utils.hpp" #include "umpire/resource/platform.hpp" -#include "umpire/util/error.hpp" #include "umpire/util/Platform.hpp" +#include "umpire/util/error.hpp" // Forward declaration of kernel for launching directly in device code if needed extern "C" { -__global__ void -umpire_cuda_fill(void* data, int value, std::size_t length); +__global__ void umpire_cuda_fill(void* data, int value, std::size_t length); } namespace { - template - struct get_kind; +// Copy direction mapping via template specialization +template +struct get_kind; - template<> - struct get_kind { - static constexpr cudaMemcpyKind value = cudaMemcpyDeviceToHost; - }; +template <> +struct get_kind { + static constexpr cudaMemcpyKind value = cudaMemcpyDeviceToHost; +}; - template<> - struct get_kind { - static constexpr cudaMemcpyKind value = cudaMemcpyHostToDevice; - }; +template <> +struct get_kind { + static constexpr cudaMemcpyKind value = cudaMemcpyHostToDevice; +}; - template<> - struct get_kind { - static constexpr cudaMemcpyKind value = cudaMemcpyDeviceToDevice; - }; -} +template <> +struct get_kind { + static constexpr cudaMemcpyKind value = cudaMemcpyDeviceToDevice; +}; +} // namespace namespace umpire { namespace op { +// CUDA implementation helpers namespace { - // Helper function to check if a CUDA device supports managed memory features - inline bool check_device_managed_memory(int device) { - cudaDeviceProp properties; - cudaError_t error = ::cudaGetDeviceProperties(&properties, device); - - if (error != cudaSuccess) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format("cudaGetDeviceProperties for device {} failed with error: {}", - device, cudaGetErrorString(error))); - } - - return (properties.managedMemory == 1 && properties.concurrentManagedAccess == 1); - } - - // Generic function to handle cudaMemAdvise operations - template - inline void advise_impl(T* ptr, std::size_t n, int device, cudaMemoryAdvise advice) { - std::size_t size = n; - if (std::is_same::value) { - // void pointers don't have a size - } else { - size = sizeof(T) * n; - } - - if (check_device_managed_memory(device)) { - cudaError_t error = ::cudaMemAdvise(ptr, size, advice, device); - - if (error != cudaSuccess) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format("cudaMemAdvise(ptr={}, size={}, advice={}, device={}) failed with error: {}", - ptr, size, static_cast(advice), device, cudaGetErrorString(error))); - } - } - } - - // Generic copy function that handles the different kinds of copies - template - inline void copy_impl(T* src, T* dst, std::size_t len, cudaMemcpyKind kind) { - std::size_t size = len; - if (!std::is_same::value) { - size = sizeof(T) * len; - } - - cudaError_t error = ::cudaMemcpy(dst, src, size, kind); - if (error != cudaSuccess) { - UMPIRE_ERROR( - runtime_error, - umpire::fmt::format( - "cudaMemcpy(dst={}, src={}, size={}, kind={}) failed with error: {}", - dst, src, size, static_cast(kind), cudaGetErrorString(error))); - } - } - - // Async version of copy for use with CUDA streams - template - inline camp::resources::EventProxy copy_async_impl(T* src, T* dst, std::size_t len, camp::resources::Resource& r, cudaMemcpyKind kind) { - auto device = r.try_get(); - if (!device) { - UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", - platform_to_string(r.get_platform()))); - } - auto stream = device->get_stream(); - - std::size_t size = len; - if (!std::is_same::value) { - size = sizeof(T) * len; - } - - cudaError_t error = ::cudaMemcpyAsync(dst, src, size, kind, stream); - - if (error != cudaSuccess) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format("cudaMemcpyAsync(dst={}, src={}, size={}, kind={}, stream={}) failed with error: {}", - dst, src, size, static_cast(kind), - (void*)stream, cudaGetErrorString(error))); - } - - return camp::resources::EventProxy{r}; - } - - // Generic memset implementation - template - inline void memset_impl(T* ptr, int value, std::size_t len) { - std::size_t size = len; - if (!std::is_same::value) { - size = sizeof(T) * len; - } - - cudaError_t error = ::cudaMemset(ptr, value, size); +// Helper function to check if a CUDA device supports managed memory features +inline bool check_device_managed_memory(int device) +{ + cudaDeviceProp properties; + cudaError_t error = ::cudaGetDeviceProperties(&properties, device); - if (error != cudaSuccess) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format("cudaMemset(ptr={}, value={}, size={}) failed with error: {}", - ptr, value, size, cudaGetErrorString(error))); - } + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, umpire::fmt::format("cudaGetDeviceProperties for device {} failed with error: {}", + device, cudaGetErrorString(error))); } - - // Async version of memset - template - inline camp::resources::EventProxy memset_async_impl(T* ptr, int value, std::size_t len, camp::resources::Resource& r) { - auto device = r.try_get(); - if (!device) { - UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", - platform_to_string(r.get_platform()))); - } - auto stream = device->get_stream(); - - std::size_t size = len; - if (!std::is_same::value) { - size = sizeof(T) * len; - } - - cudaError_t error = ::cudaMemsetAsync(ptr, value, size, stream); - if (error != cudaSuccess) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format( - "cudaMemsetAsync(ptr={}, value={}, size={}, stream={}) failed with error: {}", - ptr, value, size, (void*)stream, cudaGetErrorString(error))); - } - - return camp::resources::EventProxy{r}; - } + return (properties.managedMemory == 1 && properties.concurrentManagedAccess == 1); } -// Copy operations for different platform combinations -template<> -struct copy +// Memory advice operation helper +template +inline void advise_impl(T* ptr, std::size_t count, int device, cudaMemoryAdvise advice) { - template - static void exec(T* src, T* dst, std::size_t len) { - copy_impl(src, dst, len, cudaMemcpyDeviceToDevice); - } + if (!check_device_managed_memory(device)) + return; - template - static camp::resources::EventProxy exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { - return copy_async_impl(src, dst, len, r, cudaMemcpyDeviceToDevice); - } -}; - -template<> -struct copy -{ - template - static void exec(T* src, T* dst, std::size_t len) { - copy_impl(src, dst, len, cudaMemcpyDeviceToHost); - } + std::size_t size = detail::calculate_size(ptr, count); + cudaError_t error = ::cudaMemAdvise(ptr, size, advice, device); - template - static camp::resources::EventProxy exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { - return copy_async_impl(src, dst, len, r, cudaMemcpyDeviceToHost); + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemAdvise(ptr={}, size={}, advice={}, device={}) failed with error: {}", ptr, + size, static_cast(advice), device, cudaGetErrorString(error))); } -}; +} -template<> -struct copy +// CUDA copy implementation +template +inline void copy_impl(T* src, T* dst, std::size_t count, cudaMemcpyKind kind) { - template - static void exec(T* src, T* dst, std::size_t len) { - copy_impl(src, dst, len, cudaMemcpyHostToDevice); - } + std::size_t size = detail::get_size(count); - template - static camp::resources::EventProxy exec(T* src, T* dst, std::size_t len, camp::resources::Resource& r) { - return copy_async_impl(src, dst, len, r, cudaMemcpyHostToDevice); + cudaError_t error = ::cudaMemcpy(dst, src, size, kind); + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemcpy(dst={}, src={}, size={}, kind={}) failed with error: {}", dst, src, + size, static_cast(kind), cudaGetErrorString(error))); } -}; - -// Special handling for void pointers -template<> -template<> -inline void copy::exec(void* src, void* dst, std::size_t len) { - copy_impl(src, dst, len, cudaMemcpyDeviceToDevice); -} - -template<> -template<> -inline void copy::exec(void* src, void* dst, std::size_t len) { - copy_impl(src, dst, len, cudaMemcpyDeviceToHost); -} - -template<> -template<> -inline void copy::exec(void* src, void* dst, std::size_t len) { - copy_impl(src, dst, len, cudaMemcpyHostToDevice); } -// Memset operations -template<> -struct memset +// CUDA async copy implementation +template +inline camp::resources::EventProxy copy_async_impl(T* src, T* dst, std::size_t count, + camp::resources::Resource& r, + cudaMemcpyKind kind) { - template - static void exec(T* src, int val, std::size_t len) { - memset_impl(src, val, len); + auto device = r.try_get(); + if (!device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", + platform_to_string(r.get_platform()))); } + auto stream = device->get_stream(); + std::size_t size = detail::get_size(count); - template - static camp::resources::EventProxy exec(T* src, int val, std::size_t len, camp::resources::Resource& r) { - return memset_async_impl(src, val, len, r); + cudaError_t error = ::cudaMemcpyAsync(dst, src, size, kind, stream); + if (error != cudaSuccess) { + UMPIRE_ERROR( + runtime_error, + umpire::fmt::format("cudaMemcpyAsync(dst={}, src={}, size={}, kind={}, stream={}) failed with error: {}", dst, + src, size, static_cast(kind), (void*)stream, cudaGetErrorString(error))); } - - // Specialization for void* - template<> - static void exec(void* src, int val, std::size_t len) { - memset_impl(src, val, len); - } - - template<> - static camp::resources::EventProxy exec(void* src, int val, std::size_t len, camp::resources::Resource& r) { - return memset_async_impl(src, val, len, r); - } -}; -// Reallocate implementation -template<> -struct reallocate -{ - template - static T* exec(T* src, std::size_t size) { - if (!src) { - // This should allocate memory, but we can't do that directly here - // since we don't have access to the allocator - return nullptr; - } - - if (size == 0) { - // Should deallocate src and return nullptr - return nullptr; - } - - // This should be handled by the ResourceManager which has access to: - // 1. The AllocationRecord to get the original size - // 2. The Allocator to allocate new memory - - // For now, just return nullptr to indicate this needs to be - // handled at a higher level - return nullptr; - } -}; + return camp::resources::EventProxy{r}; +} -// Memory advice operations -template<> -struct accessed_by +// CUDA memset implementation +template +inline void memset_impl(T* ptr, int value, std::size_t count) { - template - static inline void exec(T* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseSetAccessedBy); - } - - template<> - static inline void exec(void* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseSetAccessedBy); - } -}; + std::size_t size = detail::get_size(count); -template<> -struct preferred_location -{ - template - static inline void exec(T* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseSetPreferredLocation); - } - - template<> - static inline void exec(void* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseSetPreferredLocation); + cudaError_t error = ::cudaMemset(ptr, value, size); + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, umpire::fmt::format("cudaMemset(ptr={}, value={}, size={}) failed with error: {}", ptr, + value, size, cudaGetErrorString(error))); } -}; +} -template<> -struct read_mostly +// CUDA async memset implementation +template +inline camp::resources::EventProxy memset_async_impl(T* ptr, int value, std::size_t count, + camp::resources::Resource& r) { - template - static inline void exec(T* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseSetReadMostly); - } - - template<> - static inline void exec(void* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseSetReadMostly); + auto device = r.try_get(); + if (!device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", + platform_to_string(r.get_platform()))); } -}; + auto stream = device->get_stream(); + std::size_t size = detail::get_size(count); -template<> -struct unset_accessed_by -{ - template - static inline void exec(T* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseUnsetAccessedBy); - } - - template<> - static inline void exec(void* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseUnsetAccessedBy); + cudaError_t error = ::cudaMemsetAsync(ptr, value, size, stream); + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemsetAsync(ptr={}, value={}, size={}, stream={}) failed with error: {}", ptr, + value, size, (void*)stream, cudaGetErrorString(error))); } -}; -template<> -struct unset_preferred_location -{ - template - static inline void exec(T* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseUnsetPreferredLocation); - } - - template<> - static inline void exec(void* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseUnsetPreferredLocation); - } -}; + return camp::resources::EventProxy{r}; +} -template<> -struct unset_read_mostly +// Prefetch implementation +template +inline void prefetch_impl(T* ptr, int device, std::size_t count) { - template - static inline void exec(T* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseUnsetReadMostly); - } - - template<> - static inline void exec(void* src, int device, std::size_t len) { - advise_impl(src, len, device, cudaMemAdviseUnsetReadMostly); - } -}; + // Use current device for properties if device is CPU + int current_device; + cudaGetDevice(¤t_device); + int gpu = (device != cudaCpuDeviceId) ? device : current_device; -// Prefetch operations -template<> -struct prefetch -{ - template - static void exec(T* src, int device, std::size_t len) { - // Use current device for properties if device is CPU - int current_device; - cudaGetDevice(¤t_device); - int gpu = (device != cudaCpuDeviceId) ? device : current_device; + if (check_device_managed_memory(gpu)) { + std::size_t size = detail::get_size(count); + cudaError_t error = ::cudaMemPrefetchAsync(ptr, size, device, nullptr); - if (check_device_managed_memory(gpu)) { - std::size_t size = len; - if (!std::is_same::value) { - size = sizeof(T) * len; - } - - cudaError_t error = ::cudaMemPrefetchAsync(src, size, device, nullptr); - - if (error != cudaSuccess) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}) failed with error: {}", - src, size, device, cudaGetErrorString(error))); - } + if (error != cudaSuccess) { + UMPIRE_ERROR(runtime_error, + umpire::fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}) failed with error: {}", ptr, + size, device, cudaGetErrorString(error))); } } +} - template - static camp::resources::EventProxy exec(T* src, int device, std::size_t len, camp::resources::Resource& r) { - auto cuda_device = r.try_get(); - if (!cuda_device) { - UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", - platform_to_string(r.get_platform()))); - } - auto stream = cuda_device->get_stream(); - - // Use current device for properties if device is CPU - int current_device; - cudaGetDevice(¤t_device); - int gpu = (device != cudaCpuDeviceId) ? device : current_device; - - if (check_device_managed_memory(gpu)) { - std::size_t size = len; - if (!std::is_same::value) { - size = sizeof(T) * len; - } - - cudaError_t error = ::cudaMemPrefetchAsync(src, size, device, stream); - - if (error != cudaSuccess) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}, stream={}) failed with error: {}", - src, size, device, (void*)stream, cudaGetErrorString(error))); - } - } - - return camp::resources::EventProxy{r}; +// Async prefetch implementation +template +inline camp::resources::EventProxy prefetch_async_impl(T* ptr, int device, std::size_t count, + camp::resources::Resource& r) +{ + auto cuda_device = r.try_get(); + if (!cuda_device) { + UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", + platform_to_string(r.get_platform()))); } - - // Specializations for void* - template<> - static void exec(void* src, int device, std::size_t len) { - // Use current device for properties if device is CPU - int current_device; - cudaGetDevice(¤t_device); - int gpu = (device != cudaCpuDeviceId) ? device : current_device; + auto stream = cuda_device->get_stream(); - if (check_device_managed_memory(gpu)) { - cudaError_t error = ::cudaMemPrefetchAsync(src, len, device, nullptr); - - if (error != cudaSuccess) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}) failed with error: {}", - src, len, device, cudaGetErrorString(error))); - } - } - } - - template<> - static camp::resources::EventProxy exec(void* src, int device, std::size_t len, camp::resources::Resource& r) { - auto cuda_device = r.try_get(); - if (!cuda_device) { - UMPIRE_ERROR(resource_error, umpire::fmt::format("Expected resources::Cuda, got resources::{}", - platform_to_string(r.get_platform()))); - } - auto stream = cuda_device->get_stream(); - - // Use current device for properties if device is CPU - int current_device; - cudaGetDevice(¤t_device); - int gpu = (device != cudaCpuDeviceId) ? device : current_device; + // Use current device for properties if device is CPU + int current_device; + cudaGetDevice(¤t_device); + int gpu = (device != cudaCpuDeviceId) ? device : current_device; - if (check_device_managed_memory(gpu)) { - cudaError_t error = ::cudaMemPrefetchAsync(src, len, device, stream); - - if (error != cudaSuccess) { - UMPIRE_ERROR(runtime_error, - umpire::fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}, stream={}) failed with error: {}", - src, len, device, (void*)stream, cudaGetErrorString(error))); - } - } + if (check_device_managed_memory(gpu)) { + std::size_t size = detail::get_size(count); + cudaError_t error = ::cudaMemPrefetchAsync(ptr, size, device, stream); - return camp::resources::EventProxy{r}; + if (error != cudaSuccess) { + UMPIRE_ERROR( + runtime_error, + umpire::fmt::format("cudaMemPrefetchAsync(ptr={}, size={}, device={}, stream={}) failed with error: {}", ptr, + size, device, (void*)stream, cudaGetErrorString(error))); + } } -}; + return camp::resources::EventProxy{r}; } -} \ No newline at end of file diff --git a/include/umpire/op/detail/utils.hpp b/include/umpire/op/detail/utils.hpp new file mode 100644 index 000000000..c37f85b75 --- /dev/null +++ b/include/umpire/op/detail/utils.hpp @@ -0,0 +1,18 @@ +#pragma once + +namespace umpire { +namespace op { +namespace detail { + +template +inline std::size_t get_size(std::size_t bytes) +{ + if constexpr (std::is_same_v) + return bytes; + else + return bytes * sizeof(T); +} + +} // namespace detail +} // namespace op +} // namespace umpire diff --git a/include/umpire/op/dispatch.hpp b/include/umpire/op/dispatch.hpp index 361fbf91e..db10fb74e 100644 --- a/include/umpire/op/dispatch.hpp +++ b/include/umpire/op/dispatch.hpp @@ -1,195 +1,263 @@ #pragma once +#include "umpire/ResourceManager.hpp" #include "umpire/config.hpp" - #include "umpire/resource/platform.hpp" -#include "umpire/ResourceManager.hpp" - namespace umpire { namespace op { -// Base template for op_caller -template class Op> +// Platform dispatch for single-platform operations +template