From dbdec4392ee2f9a65d55ca0772d7f78b5f309b55 Mon Sep 17 00:00:00 2001 From: Guy Ealey Morag Date: Fri, 5 Jun 2026 11:27:11 +0000 Subject: [PATCH 1/4] UCP/CORE: Fix num_tls overflow detection Signed-off-by: Guy Ealey Morag --- src/ucp/core/ucp_context.c | 90 +++++++++++++++++++++----------------- 1 file changed, 50 insertions(+), 40 deletions(-) diff --git a/src/ucp/core/ucp_context.c b/src/ucp/core/ucp_context.c index 4ae9573c7ba..b35566de7ee 100644 --- a/src/ucp/core/ucp_context.c +++ b/src/ucp/core/ucp_context.c @@ -1294,40 +1294,46 @@ static void ucp_add_tl_resource_if_enabled( uint8_t rsc_flags; ucp_rsc_index_t dev_index, i; - if (ucp_is_resource_enabled(resource, config, aux_tls, &rsc_flags, - dev_cfg_masks, tl_cfg_mask)) { - if ((resource->sys_device != UCS_SYS_DEVICE_ID_UNKNOWN) && - (resource->sys_device >= UCP_MAX_SYS_DEVICES)) { - ucs_diag(UCT_TL_RESOURCE_DESC_FMT - " system device is %d, which exceeds the maximal " - "supported (%d), system locality may be ignored", - UCT_TL_RESOURCE_DESC_ARG(resource), resource->sys_device, - UCP_MAX_SYS_DEVICES); - } - context->tl_rscs[context->num_tls].tl_rsc = *resource; - context->tl_rscs[context->num_tls].md_index = md_index; - context->tl_rscs[context->num_tls].tl_name_csum = - ucs_crc16_string(resource->tl_name); - context->tl_rscs[context->num_tls].flags = rsc_flags; - - dev_index = 0; - for (i = 0; i < context->num_tls; ++i) { - if (ucp_tl_resource_is_same_device(&context->tl_rscs[i].tl_rsc, resource)) { - dev_index = context->tl_rscs[i].dev_index; - break; - } else { - dev_index = ucs_max(context->tl_rscs[i].dev_index + 1, dev_index); - } - } - context->tl_rscs[context->num_tls].dev_index = dev_index; + if (context->num_tls == UINT8_MAX) { + return; + } - if (resource->sys_device < UCP_MAX_SYS_DEVICES) { - md->sys_dev_map |= UCS_BIT(resource->sys_device); + if (!ucp_is_resource_enabled(resource, config, aux_tls, &rsc_flags, + dev_cfg_masks, tl_cfg_mask)) { + return; + } + + if ((resource->sys_device != UCS_SYS_DEVICE_ID_UNKNOWN) && + (resource->sys_device >= UCP_MAX_SYS_DEVICES)) { + ucs_diag(UCT_TL_RESOURCE_DESC_FMT + " system device is %d, which exceeds the maximal " + "supported (%d), system locality may be ignored", + UCT_TL_RESOURCE_DESC_ARG(resource), resource->sys_device, + UCP_MAX_SYS_DEVICES); + } + context->tl_rscs[context->num_tls].tl_rsc = *resource; + context->tl_rscs[context->num_tls].md_index = md_index; + context->tl_rscs[context->num_tls].tl_name_csum = + ucs_crc16_string(resource->tl_name); + context->tl_rscs[context->num_tls].flags = rsc_flags; + + dev_index = 0; + for (i = 0; i < context->num_tls; ++i) { + if (ucp_tl_resource_is_same_device(&context->tl_rscs[i].tl_rsc, resource)) { + dev_index = context->tl_rscs[i].dev_index; + break; + } else { + dev_index = ucs_max(context->tl_rscs[i].dev_index + 1, dev_index); } + } + context->tl_rscs[context->num_tls].dev_index = dev_index; - ++context->num_tls; - ++(*num_resources_p); + if (resource->sys_device < UCP_MAX_SYS_DEVICES) { + md->sys_dev_map |= UCS_BIT(resource->sys_device); } + + ++context->num_tls; + ++(*num_resources_p); } static ucs_status_t @@ -1343,7 +1349,7 @@ ucp_add_tl_resources(ucp_context_h context, ucp_md_index_t md_index, ucp_tl_resource_desc_t *tmp; unsigned num_tl_resources; ucs_status_t status; - ucp_rsc_index_t i; + unsigned i; *num_resources_p = 0; @@ -1696,14 +1702,6 @@ static ucs_status_t ucp_check_resources(ucp_context_h context, return UCS_ERR_NO_DEVICE; } - /* Error check: Make sure there are not too many transports */ - if (context->num_tls >= UCP_MAX_RESOURCES) { - ucs_error("exceeded transports/devices limit " - "(%u requested, up to %d are supported)", - context->num_tls, UCP_MAX_RESOURCES); - return UCS_ERR_EXCEEDS_LIMIT; - } - return ucp_check_tl_names(context); } @@ -1720,11 +1718,11 @@ ucp_add_component_resources(ucp_context_h context, ucp_rsc_index_t cmpt_index, uct_component_attr_t uct_component_attr; unsigned num_tl_resources; ucs_status_t status; - ucp_rsc_index_t i; const uct_md_attr_v2_t *md_attr; unsigned md_index; uint64_t detect_mem_type_mask; uint64_t alloc_mem_type_mask; + unsigned i; /* List memory domain resources */ uct_component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES | @@ -2201,6 +2199,18 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context, } } + /* Error check: Make sure there are not too many transports. + * UCP_MAX_RESOURCES must be less than UINT8_MAX to detect overflow. */ + UCS_STATIC_ASSERT(UCP_MAX_RESOURCES < UINT8_MAX); + if (context->num_tls > UCP_MAX_RESOURCES) { + ucs_error("exceeded transports/devices limit " + "(%s%u requested, up to %d are supported)", + (context->num_tls == UINT8_MAX) ? ">=" : "", context->num_tls, + UCP_MAX_RESOURCES); + status = UCS_ERR_EXCEEDS_LIMIT; + goto err_free_resources; + } + ucp_fill_resources_reg_md_map_update(context); /* If unified mode is enabled, initialize tl_bitmap to 0. From c9c11b4f405d4e9e6e677393493bdbc69402f337 Mon Sep 17 00:00:00 2001 From: Guy Ealey Morag Date: Wed, 10 Jun 2026 11:26:58 +0000 Subject: [PATCH 2/4] UCP/CORE: Stop parsing devices at UCP_MAX_RESOURCES Signed-off-by: Guy Ealey Morag --- src/ucp/core/ucp_context.c | 37 ++++++++++++++++--------------------- 1 file changed, 16 insertions(+), 21 deletions(-) diff --git a/src/ucp/core/ucp_context.c b/src/ucp/core/ucp_context.c index b35566de7ee..5a1e110dee7 100644 --- a/src/ucp/core/ucp_context.c +++ b/src/ucp/core/ucp_context.c @@ -1284,7 +1284,7 @@ static int ucp_tl_resource_is_same_device(const uct_tl_resource_desc_t *resource (resource1->sys_device == resource2->sys_device)); } -static void ucp_add_tl_resource_if_enabled( +static ucs_status_t ucp_add_tl_resource_if_enabled( ucp_context_h context, ucp_md_index_t md_index, const ucp_config_t *config, const ucs_string_set_t *aux_tls, const uct_tl_resource_desc_t *resource, unsigned *num_resources_p, @@ -1294,13 +1294,15 @@ static void ucp_add_tl_resource_if_enabled( uint8_t rsc_flags; ucp_rsc_index_t dev_index, i; - if (context->num_tls == UINT8_MAX) { - return; - } - if (!ucp_is_resource_enabled(resource, config, aux_tls, &rsc_flags, dev_cfg_masks, tl_cfg_mask)) { - return; + return UCS_OK; + } + + if (context->num_tls >= UCP_MAX_RESOURCES) { + ucs_error("exceeded transports/devices limit (up to %d are supported)", + UCP_MAX_RESOURCES); + return UCS_ERR_EXCEEDS_LIMIT; } if ((resource->sys_device != UCS_SYS_DEVICE_ID_UNKNOWN) && @@ -1334,6 +1336,8 @@ static void ucp_add_tl_resource_if_enabled( ++context->num_tls; ++(*num_resources_p); + + return UCS_OK; } static ucs_status_t @@ -1387,9 +1391,12 @@ ucp_add_tl_resources(ucp_context_h context, ucp_md_index_t md_index, "'%s'(%s)", tl_resources[i].dev_name, context->tl_cmpts[md->cmpt_index].attr.name); ucs_string_set_add(avail_tls, tl_resources[i].tl_name); - ucp_add_tl_resource_if_enabled(context, md_index, config, aux_tls, - &tl_resources[i], num_resources_p, - dev_cfg_masks, tl_cfg_mask); + status = ucp_add_tl_resource_if_enabled(context, md_index, config, aux_tls, + &tl_resources[i], num_resources_p, + dev_cfg_masks, tl_cfg_mask); + if (status != UCS_OK) { + goto free_resources; + } } status = UCS_OK; @@ -2199,18 +2206,6 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context, } } - /* Error check: Make sure there are not too many transports. - * UCP_MAX_RESOURCES must be less than UINT8_MAX to detect overflow. */ - UCS_STATIC_ASSERT(UCP_MAX_RESOURCES < UINT8_MAX); - if (context->num_tls > UCP_MAX_RESOURCES) { - ucs_error("exceeded transports/devices limit " - "(%s%u requested, up to %d are supported)", - (context->num_tls == UINT8_MAX) ? ">=" : "", context->num_tls, - UCP_MAX_RESOURCES); - status = UCS_ERR_EXCEEDS_LIMIT; - goto err_free_resources; - } - ucp_fill_resources_reg_md_map_update(context); /* If unified mode is enabled, initialize tl_bitmap to 0. From 50dd88fc6eae47bb78269a8bfa94d1f9a30393c5 Mon Sep 17 00:00:00 2001 From: Guy Ealey Morag Date: Wed, 10 Jun 2026 11:41:03 +0000 Subject: [PATCH 3/4] UCP/CORE: Fix formatting Signed-off-by: Guy Ealey Morag --- src/ucp/core/ucp_context.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/ucp/core/ucp_context.c b/src/ucp/core/ucp_context.c index 5a1e110dee7..72ea2820e1f 100644 --- a/src/ucp/core/ucp_context.c +++ b/src/ucp/core/ucp_context.c @@ -1284,11 +1284,13 @@ static int ucp_tl_resource_is_same_device(const uct_tl_resource_desc_t *resource (resource1->sys_device == resource2->sys_device)); } -static ucs_status_t ucp_add_tl_resource_if_enabled( - ucp_context_h context, ucp_md_index_t md_index, - const ucp_config_t *config, const ucs_string_set_t *aux_tls, - const uct_tl_resource_desc_t *resource, unsigned *num_resources_p, - uint64_t dev_cfg_masks[], uint64_t *tl_cfg_mask) +static ucs_status_t +ucp_add_tl_resource_if_enabled(ucp_context_h context, ucp_md_index_t md_index, + const ucp_config_t *config, + const ucs_string_set_t *aux_tls, + const uct_tl_resource_desc_t *resource, + unsigned *num_resources_p, + uint64_t dev_cfg_masks[], uint64_t *tl_cfg_mask) { ucp_tl_md_t *md = &context->tl_mds[md_index]; uint8_t rsc_flags; From c26defcc9e0b32b6000258c458c8796d72e733e5 Mon Sep 17 00:00:00 2001 From: Guy Ealey Morag Date: Wed, 10 Jun 2026 13:38:27 +0000 Subject: [PATCH 4/4] UCP/CORE: Fix formatting Signed-off-by: Guy Ealey Morag --- src/ucp/core/ucp_context.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/ucp/core/ucp_context.c b/src/ucp/core/ucp_context.c index 72ea2820e1f..3366a2c75c3 100644 --- a/src/ucp/core/ucp_context.c +++ b/src/ucp/core/ucp_context.c @@ -1303,7 +1303,7 @@ ucp_add_tl_resource_if_enabled(ucp_context_h context, ucp_md_index_t md_index, if (context->num_tls >= UCP_MAX_RESOURCES) { ucs_error("exceeded transports/devices limit (up to %d are supported)", - UCP_MAX_RESOURCES); + UCP_MAX_RESOURCES); return UCS_ERR_EXCEEDS_LIMIT; } @@ -1315,15 +1315,17 @@ ucp_add_tl_resource_if_enabled(ucp_context_h context, ucp_md_index_t md_index, UCT_TL_RESOURCE_DESC_ARG(resource), resource->sys_device, UCP_MAX_SYS_DEVICES); } + context->tl_rscs[context->num_tls].tl_rsc = *resource; context->tl_rscs[context->num_tls].md_index = md_index; - context->tl_rscs[context->num_tls].tl_name_csum = - ucs_crc16_string(resource->tl_name); + context->tl_rscs[context->num_tls].tl_name_csum = ucs_crc16_string( + resource->tl_name); context->tl_rscs[context->num_tls].flags = rsc_flags; dev_index = 0; for (i = 0; i < context->num_tls; ++i) { - if (ucp_tl_resource_is_same_device(&context->tl_rscs[i].tl_rsc, resource)) { + if (ucp_tl_resource_is_same_device(&context->tl_rscs[i].tl_rsc, + resource)) { dev_index = context->tl_rscs[i].dev_index; break; } else { @@ -1393,9 +1395,10 @@ ucp_add_tl_resources(ucp_context_h context, ucp_md_index_t md_index, "'%s'(%s)", tl_resources[i].dev_name, context->tl_cmpts[md->cmpt_index].attr.name); ucs_string_set_add(avail_tls, tl_resources[i].tl_name); - status = ucp_add_tl_resource_if_enabled(context, md_index, config, aux_tls, - &tl_resources[i], num_resources_p, - dev_cfg_masks, tl_cfg_mask); + status = ucp_add_tl_resource_if_enabled(context, md_index, config, + aux_tls, &tl_resources[i], + num_resources_p, dev_cfg_masks, + tl_cfg_mask); if (status != UCS_OK) { goto free_resources; }