From 17616e466a457ed8a355aa701ce657d7bcd37c2c Mon Sep 17 00:00:00 2001 From: Raul Akhmetshin Date: Fri, 29 May 2026 16:02:10 +0300 Subject: [PATCH 1/3] UCT/CUDA/CUDA_IPC: Separated get and put remote cache methods. --- src/uct/cuda/cuda_ipc/cuda_ipc_cache.c | 115 +++++++++++++++---------- src/uct/cuda/cuda_ipc/cuda_ipc_cache.h | 9 +- src/uct/cuda/cuda_ipc/cuda_ipc_iface.c | 17 ++-- 3 files changed, 81 insertions(+), 60 deletions(-) diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c index f1484418abe..e4c6accbb69 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c @@ -10,16 +10,17 @@ #include "cuda_ipc_cache.h" #include "cuda_ipc_iface.h" +#include "cuda_ipc.inl" + +#include #include #include #include #include #include #include -#include +#include #include -#include "cuda_ipc.inl" - typedef struct uct_cuda_ipc_cache_hash_key { pid_t pid; ucs_sys_ns_t pid_ns; @@ -59,7 +60,7 @@ KHASH_INIT(cuda_ipc_rem_cache, uct_cuda_ipc_cache_hash_key_t, */ typedef struct uct_cuda_ipc_remote_cache { khash_t(cuda_ipc_rem_cache) hash; - ucs_recursive_spinlock_t lock; + ucs_rw_spinlock_t lock; unsigned long max_regions; /**< Global max regions limit */ size_t max_size; /**< Global max total size limit */ } uct_cuda_ipc_remote_cache_t; @@ -516,49 +517,75 @@ static void uct_cuda_ipc_cache_invalidate_regions(uct_cuda_ipc_cache_t *cache, cache->name, from, to); } -static ucs_status_t +static int uct_cuda_ipc_get_remote_cache(const uct_cuda_ipc_cache_hash_key_t *key, - uct_cuda_ipc_cache_t **cache) + uct_cuda_ipc_cache_t **cache_p) { - ucs_status_t status = UCS_OK; + khint_t it; + int found; + + ucs_rw_spinlock_read_lock(&uct_cuda_ipc_remote_cache.lock); + + it = kh_get(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, *key); + found = (it != kh_end(&uct_cuda_ipc_remote_cache.hash)); + if (found) { + *cache_p = kh_val(&uct_cuda_ipc_remote_cache.hash, it); + } + + ucs_rw_spinlock_read_unlock(&uct_cuda_ipc_remote_cache.lock); + return found; +} + +static ucs_status_t +uct_cuda_ipc_put_remote_cache(const uct_cuda_ipc_cache_hash_key_t *key, + uct_cuda_ipc_cache_t **cache_p) +{ + int ret; + khint_t it; + ucs_status_t status; char target_name[64]; - khiter_t khiter; - int khret; - ucs_recursive_spin_lock(&uct_cuda_ipc_remote_cache.lock); + if (uct_cuda_ipc_get_remote_cache(key, cache_p)) { + return UCS_OK; + } + + ucs_rw_spinlock_write_lock(&uct_cuda_ipc_remote_cache.lock); - khiter = kh_put(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, *key, - &khret); - if ((khret == UCS_KH_PUT_BUCKET_EMPTY) || - (khret == UCS_KH_PUT_BUCKET_CLEAR)) { - ucs_snprintf_safe(target_name, sizeof(target_name), "dest:%d:%u:%d", - key->pid, key->pid_ns, key->cu_device); - status = uct_cuda_ipc_create_cache(cache, target_name); - if (status != UCS_OK) { - kh_del(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, khiter); - ucs_error("could not create create cuda ipc cache: %s", - ucs_status_string(status)); - goto err_unlock; - } + it = kh_put(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, *key, + &ret); + if (ret == UCS_KH_PUT_FAILED) { + ucs_error("failed to allocate cuda_ipc remote_cache hash entry"); + status = UCS_ERR_NO_MEMORY; + goto out_unlock; + } - kh_val(&uct_cuda_ipc_remote_cache.hash, khiter) = *cache; - } else if (khret == UCS_KH_PUT_KEY_PRESENT) { - *cache = kh_val(&uct_cuda_ipc_remote_cache.hash, khiter); - } else { - ucs_error("unable to use cuda_ipc remote_cache hash"); - status = UCS_ERR_NO_RESOURCE; + ucs_assertv_always(ret != UCS_KH_PUT_KEY_PRESENT, "key %d:%u:%d is present", + key->pid, key->pid_ns, key->cu_device); + ucs_assertv_always((ret == UCS_KH_PUT_BUCKET_EMPTY) || + (ret == UCS_KH_PUT_BUCKET_CLEAR), + "invalid return value: %d", ret); + + ucs_snprintf_safe(target_name, sizeof(target_name), "dest:%d:%u:%d", + key->pid, key->pid_ns, key->cu_device); + status = uct_cuda_ipc_create_cache(cache_p, target_name); + if (status != UCS_OK) { + kh_del(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, it); + ucs_error("could not create create cuda ipc cache: %s", + ucs_status_string(status)); + goto out_unlock; } -err_unlock: - ucs_recursive_spin_unlock(&uct_cuda_ipc_remote_cache.lock); + + kh_val(&uct_cuda_ipc_remote_cache.hash, it) = *cache_p; + +out_unlock: + ucs_rw_spinlock_write_unlock(&uct_cuda_ipc_remote_cache.lock); return status; } -ucs_status_t uct_cuda_ipc_unmap_memhandle(pid_t pid, ucs_sys_ns_t pid_ns, - uintptr_t d_bptr, - const void *mapped_addr, - CUdevice cu_dev, int cache_enabled) +void uct_cuda_ipc_unmap_memhandle(pid_t pid, ucs_sys_ns_t pid_ns, + uintptr_t d_bptr, const void *mapped_addr, + CUdevice cu_dev, int cache_enabled) { - ucs_status_t status = UCS_OK; const uct_cuda_ipc_cache_hash_key_t key = {pid, pid_ns, cu_dev}; uct_cuda_ipc_cache_t *cache; ucs_pgt_region_t *pgt_region; @@ -569,12 +596,13 @@ ucs_status_t uct_cuda_ipc_unmap_memhandle(pid_t pid, ucs_sys_ns_t pid_ns, * see uct_cuda_ipc_map_memhandle for more details */ if ((d_bptr == (uintptr_t)mapped_addr) && uct_cuda_ipc_is_rkey_local(pid, pid_ns)) { - return UCS_OK; + return; } - status = uct_cuda_ipc_get_remote_cache(&key, &cache); - if (status != UCS_OK) { - return status; + if (!uct_cuda_ipc_get_remote_cache(&key, &cache)) { + ucs_debug("no remote cache found for key: %d:%u:%d", pid, pid_ns, + cu_dev); + return; } /* use write lock because cache maybe modified */ @@ -592,7 +620,6 @@ ucs_status_t uct_cuda_ipc_unmap_memhandle(pid_t pid, ucs_sys_ns_t pid_ns, } pthread_rwlock_unlock(&cache->lock); - return status; } UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle, @@ -626,7 +653,7 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle, return UCS_OK; } - status = uct_cuda_ipc_get_remote_cache(&hash_key, &cache); + status = uct_cuda_ipc_put_remote_cache(&hash_key, &cache); if (status != UCS_OK) { return status; } @@ -822,7 +849,7 @@ void uct_cuda_ipc_cache_set_global_limits(unsigned long max_regions, } UCS_STATIC_INIT { - ucs_recursive_spinlock_init(&uct_cuda_ipc_remote_cache.lock, 0); + ucs_rw_spinlock_init(&uct_cuda_ipc_remote_cache.lock); kh_init_inplace(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash); uct_cuda_ipc_remote_cache.max_regions = ULONG_MAX; uct_cuda_ipc_remote_cache.max_size = SIZE_MAX; @@ -855,5 +882,5 @@ UCS_STATIC_CLEANUP { uct_cuda_ipc_destroy_cache(rem_cache); }) kh_destroy_inplace(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash); - ucs_recursive_spinlock_destroy(&uct_cuda_ipc_remote_cache.lock); + ucs_rw_spinlock_cleanup(&uct_cuda_ipc_remote_cache.lock); } diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h index 4aeabf9eeee..d822f1e06d6 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.h @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2018. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2018-2026. ALL RIGHTS RESERVED. * * See file LICENSE for terms. */ @@ -68,10 +68,9 @@ ucs_status_t uct_cuda_ipc_map_memhandle(uct_cuda_ipc_extended_rkey_t *key, ucs_log_level_t log_level); -ucs_status_t uct_cuda_ipc_unmap_memhandle(pid_t pid, ucs_sys_ns_t pid_ns, - uintptr_t d_bptr, - const void *mapped_addr, - CUdevice cu_dev, int cache_enabled); +void uct_cuda_ipc_unmap_memhandle(pid_t pid, ucs_sys_ns_t pid_ns, + uintptr_t d_bptr, const void *mapped_addr, + CUdevice cu_dev, int cache_enabled); /** diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c index ef41ec5263c..8ecfecb5f07 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_iface.c @@ -1,5 +1,5 @@ /** - * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2018-2019. ALL RIGHTS RESERVED. + * Copyright (c) NVIDIA CORPORATION & AFFILIATES, 2018-2026. ALL RIGHTS RESERVED. * See file LICENSE for terms. */ @@ -312,17 +312,12 @@ static void uct_cuda_ipc_complete_event(uct_iface_h tl_iface, uct_cuda_ipc_iface_t); uct_cuda_ipc_event_desc_t *cuda_ipc_event = ucs_derived_of(cuda_event, uct_cuda_ipc_event_desc_t); - ucs_status_t status; - status = uct_cuda_ipc_unmap_memhandle(cuda_ipc_event->pid, - cuda_ipc_event->pid_ns, - cuda_ipc_event->d_bptr, - cuda_ipc_event->mapped_addr, - cuda_ipc_event->cuda_device, - iface->config.enable_cache); - if (status != UCS_OK) { - ucs_fatal("failed to unmap addr:%p", cuda_ipc_event->mapped_addr); - } + uct_cuda_ipc_unmap_memhandle(cuda_ipc_event->pid, cuda_ipc_event->pid_ns, + cuda_ipc_event->d_bptr, + cuda_ipc_event->mapped_addr, + cuda_ipc_event->cuda_device, + iface->config.enable_cache); } static uct_iface_ops_t uct_cuda_ipc_iface_ops = { From 15d51a10097988612a7b8b661e6601416aab9039 Mon Sep 17 00:00:00 2001 From: Raul Akhmetshin Date: Tue, 2 Jun 2026 11:29:40 +0300 Subject: [PATCH 2/3] UCT/CUDA/CUDA_IPC: Addressed thread safety issue. --- src/uct/cuda/cuda_ipc/cuda_ipc_cache.c | 180 ++++++++++++++----------- 1 file changed, 98 insertions(+), 82 deletions(-) diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c index 780344c952f..bbbb080bdf5 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c @@ -514,94 +514,69 @@ static void uct_cuda_ipc_cache_invalidate_regions(uct_cuda_ipc_cache_t *cache, cache->name, from, to); } -static int -uct_cuda_ipc_get_remote_cache(const uct_cuda_ipc_cache_hash_key_t *key, - uct_cuda_ipc_cache_t **cache_p) +static uct_cuda_ipc_cache_t * +uct_cuda_ipc_remote_cache_get(uct_cuda_ipc_cache_hash_key_t key) { - khint_t it; - int found; - - ucs_rw_spinlock_read_lock(&uct_cuda_ipc_remote_cache.lock); + const khint_t it = kh_get(cuda_ipc_rem_cache, + &uct_cuda_ipc_remote_cache.hash, key); - it = kh_get(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, *key); - found = (it != kh_end(&uct_cuda_ipc_remote_cache.hash)); - if (found) { - *cache_p = kh_val(&uct_cuda_ipc_remote_cache.hash, it); + if (it != kh_end(&uct_cuda_ipc_remote_cache.hash)) { + return kh_value(&uct_cuda_ipc_remote_cache.hash, it); } - ucs_rw_spinlock_read_unlock(&uct_cuda_ipc_remote_cache.lock); - return found; + return NULL; } static ucs_status_t -uct_cuda_ipc_put_remote_cache(const uct_cuda_ipc_cache_hash_key_t *key, +uct_cuda_ipc_remote_cache_put(uct_cuda_ipc_cache_hash_key_t key, uct_cuda_ipc_cache_t **cache_p) { + uct_cuda_ipc_cache_t *cache; int ret; khint_t it; - ucs_status_t status; char target_name[64]; + ucs_status_t status; - if (uct_cuda_ipc_get_remote_cache(key, cache_p)) { - return UCS_OK; + cache = uct_cuda_ipc_remote_cache_get(key); + if (cache != NULL) { + goto out; } - ucs_rw_spinlock_write_lock(&uct_cuda_ipc_remote_cache.lock); - - it = kh_put(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, *key, - &ret); + it = kh_put(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, key, &ret); if (ret == UCS_KH_PUT_FAILED) { ucs_error("failed to allocate cuda_ipc remote_cache hash entry"); - status = UCS_ERR_NO_MEMORY; - goto out_unlock; + return UCS_ERR_NO_MEMORY; } ucs_assertv_always(ret != UCS_KH_PUT_KEY_PRESENT, "key %d:%u:%d is present", - key->pid, key->pid_ns, key->cu_device); + key.pid, key.pid_ns, key.cu_device); ucs_assertv_always((ret == UCS_KH_PUT_BUCKET_EMPTY) || (ret == UCS_KH_PUT_BUCKET_CLEAR), "invalid return value: %d", ret); ucs_snprintf_safe(target_name, sizeof(target_name), "dest:%d:%u:%d", - key->pid, key->pid_ns, key->cu_device); - status = uct_cuda_ipc_create_cache(cache_p, target_name); + key.pid, key.pid_ns, key.cu_device); + status = uct_cuda_ipc_create_cache(&cache, target_name); if (status != UCS_OK) { kh_del(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, it); - ucs_error("could not create create cuda ipc cache: %s", - ucs_status_string(status)); - goto out_unlock; + ucs_error("failed to create create cuda ipc cache: %s", target_name); + return status; } - kh_val(&uct_cuda_ipc_remote_cache.hash, it) = *cache_p; + kh_val(&uct_cuda_ipc_remote_cache.hash, it) = cache; -out_unlock: - ucs_rw_spinlock_write_unlock(&uct_cuda_ipc_remote_cache.lock); - return status; +out: + *cache_p = cache; + return UCS_OK; } -void uct_cuda_ipc_unmap_memhandle(pid_t pid, ucs_sys_ns_t pid_ns, - uintptr_t d_bptr, const void *mapped_addr, - CUdevice cu_dev, int cache_enabled) +static void +uct_cuda_ipc_cache_destroy_region(uct_cuda_ipc_cache_t *cache, uintptr_t d_bptr, + const void *mapped_addr, int cache_enabled) { - const uct_cuda_ipc_cache_hash_key_t key = {pid, pid_ns, cu_dev}; - uct_cuda_ipc_cache_t *cache; ucs_pgt_region_t *pgt_region; uct_cuda_ipc_cache_region_t *region; - /* checking if the mapped address is the same as the d_bptr - * this is true for the case of single process memory mapping - * see uct_cuda_ipc_map_memhandle for more details */ - if ((d_bptr == (uintptr_t)mapped_addr) && - uct_cuda_ipc_is_rkey_local(pid, pid_ns)) { - return; - } - - if (!uct_cuda_ipc_get_remote_cache(&key, &cache)) { - ucs_debug("no remote cache found for key: %d:%u:%d", pid, pid_ns, - cu_dev); - return; - } - /* use write lock because cache maybe modified */ pthread_rwlock_wrlock(&cache->lock); pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, &cache->pgtable, d_bptr); @@ -619,41 +594,43 @@ void uct_cuda_ipc_unmap_memhandle(pid_t pid, ucs_sys_ns_t pid_ns, pthread_rwlock_unlock(&cache->lock); } -UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle, - (ext_key, cu_dev, mapped_addr, log_level), - uct_cuda_ipc_extended_rkey_t *ext_key, CUdevice cu_dev, - void **mapped_addr, ucs_log_level_t log_level) +void uct_cuda_ipc_unmap_memhandle(pid_t pid, ucs_sys_ns_t pid_ns, + uintptr_t d_bptr, const void *mapped_addr, + CUdevice cu_dev, int cache_enabled) { - uct_cuda_ipc_rkey_t *key = &ext_key->super; - const uct_cuda_ipc_cache_hash_key_t hash_key = {key->pid, ext_key->pid_ns, - cu_dev}; + const uct_cuda_ipc_cache_hash_key_t key = {pid, pid_ns, cu_dev}; uct_cuda_ipc_cache_t *cache; - ucs_status_t status; - ucs_pgt_region_t *pgt_region; - uct_cuda_ipc_cache_region_t *region; - CUuuid uuid; - int ret; - status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGetUuid(&uuid, cu_dev)); - if (status != UCS_OK) { - return status; + /* checking if the mapped address is the same as the d_bptr + * this is true for the case of single process memory mapping + * see uct_cuda_ipc_map_memhandle for more details */ + if ((d_bptr == (uintptr_t)mapped_addr) && + uct_cuda_ipc_is_rkey_local(pid, pid_ns)) { + return; } - if (uct_cuda_ipc_is_rkey_local(key->pid, ext_key->pid_ns) && - (memcmp(uuid.bytes, key->uuid.bytes, sizeof(uuid.bytes)) == 0)) { - /* TODO: added for test purpose to enable cuda_ipc tests in gtest - * mapped addrr is set to be same as d_bptr avoiding any calls to - * uct_cuda_ipc_open_memhandle which would fail with invalid argument - * error - */ - *mapped_addr = (CUdeviceptr*)key->d_bptr; - return UCS_OK; + ucs_rw_spinlock_read_lock(&uct_cuda_ipc_remote_cache.lock); + cache = uct_cuda_ipc_remote_cache_get(key); + if (cache != NULL) { + uct_cuda_ipc_cache_destroy_region(cache, d_bptr, mapped_addr, + cache_enabled); + } else { + ucs_debug("no remote cache found for key: %d:%u:%d", pid, pid_ns, + cu_dev); } - status = uct_cuda_ipc_put_remote_cache(&hash_key, &cache); - if (status != UCS_OK) { - return status; - } + ucs_rw_spinlock_read_unlock(&uct_cuda_ipc_remote_cache.lock); +} + +static ucs_status_t +uct_cuda_ipc_cache_put_region(uct_cuda_ipc_cache_t *cache, + uct_cuda_ipc_rkey_t *key, CUdevice cu_dev, + void **mapped_addr, ucs_log_level_t log_level) +{ + ucs_pgt_region_t *pgt_region; + uct_cuda_ipc_cache_region_t *region; + ucs_status_t status; + int ret; pthread_rwlock_wrlock(&cache->lock); pgt_region = UCS_PROFILE_CALL(ucs_pgtable_lookup, @@ -780,6 +757,45 @@ UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle, return status; } +UCS_PROFILE_FUNC(ucs_status_t, uct_cuda_ipc_map_memhandle, + (ext_key, cu_dev, mapped_addr, log_level), + uct_cuda_ipc_extended_rkey_t *ext_key, CUdevice cu_dev, + void **mapped_addr, ucs_log_level_t log_level) +{ + uct_cuda_ipc_rkey_t *key = &ext_key->super; + const uct_cuda_ipc_cache_hash_key_t hash_key = {key->pid, ext_key->pid_ns, + cu_dev}; + uct_cuda_ipc_cache_t *cache; + ucs_status_t status; + CUuuid uuid; + + status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGetUuid(&uuid, cu_dev)); + if (status != UCS_OK) { + return status; + } + + if (uct_cuda_ipc_is_rkey_local(key->pid, ext_key->pid_ns) && + (memcmp(uuid.bytes, key->uuid.bytes, sizeof(uuid.bytes)) == 0)) { + /* TODO: added for test purpose to enable cuda_ipc tests in gtest + * mapped addrr is set to be same as d_bptr avoiding any calls to + * uct_cuda_ipc_open_memhandle which would fail with invalid argument + * error + */ + *mapped_addr = (CUdeviceptr*)key->d_bptr; + return UCS_OK; + } + + ucs_rw_spinlock_write_lock(&uct_cuda_ipc_remote_cache.lock); + status = uct_cuda_ipc_remote_cache_put(hash_key, &cache); + if (status == UCS_OK) { + status = uct_cuda_ipc_cache_put_region(cache, key, cu_dev, mapped_addr, + log_level); + } + + ucs_rw_spinlock_write_unlock(&uct_cuda_ipc_remote_cache.lock); + return status; +} + ucs_status_t uct_cuda_ipc_create_cache(uct_cuda_ipc_cache_t **cache, const char *name) { @@ -864,7 +880,7 @@ void uct_cuda_ipc_destroy_cache_by_iface_address( return; } - ucs_recursive_spin_lock(&uct_cuda_ipc_remote_cache.lock); + ucs_rw_spinlock_write_lock(&uct_cuda_ipc_remote_cache.lock); for (device_index = 0; device_index < num_devices; ++device_index) { status = UCT_CUDADRV_FUNC_LOG_WARN( @@ -886,7 +902,7 @@ void uct_cuda_ipc_destroy_cache_by_iface_address( kh_del(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, khiter); } - ucs_recursive_spin_unlock(&uct_cuda_ipc_remote_cache.lock); + ucs_rw_spinlock_write_unlock(&uct_cuda_ipc_remote_cache.lock); } UCS_STATIC_INIT { From cc9c5811251e4d8ac726f0d23dbaeac90eb3db08 Mon Sep 17 00:00:00 2001 From: Raul Akhmetshin Date: Fri, 5 Jun 2026 17:23:25 +0300 Subject: [PATCH 3/3] UCT/CUDA/CUDA_IPC: Fixed error string. --- src/uct/cuda/cuda_ipc/cuda_ipc_cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c index bbbb080bdf5..fc76a6cc08a 100644 --- a/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c +++ b/src/uct/cuda/cuda_ipc/cuda_ipc_cache.c @@ -559,7 +559,8 @@ uct_cuda_ipc_remote_cache_put(uct_cuda_ipc_cache_hash_key_t key, status = uct_cuda_ipc_create_cache(&cache, target_name); if (status != UCS_OK) { kh_del(cuda_ipc_rem_cache, &uct_cuda_ipc_remote_cache.hash, it); - ucs_error("failed to create create cuda ipc cache: %s", target_name); + ucs_error("failed to create cuda ipc cache: %s", + ucs_status_string(status)); return status; }