Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 50 additions & 40 deletions src/ucp/core/ucp_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -1294,40 +1294,46 @@ static void ucp_add_tl_resource_if_enabled(
uint8_t rsc_flags;
ucp_rsc_index_t dev_index, i;

if (ucp_is_resource_enabled(resource, config, aux_tls, &rsc_flags,
dev_cfg_masks, tl_cfg_mask)) {
if ((resource->sys_device != UCS_SYS_DEVICE_ID_UNKNOWN) &&
(resource->sys_device >= UCP_MAX_SYS_DEVICES)) {
ucs_diag(UCT_TL_RESOURCE_DESC_FMT
" system device is %d, which exceeds the maximal "
"supported (%d), system locality may be ignored",
UCT_TL_RESOURCE_DESC_ARG(resource), resource->sys_device,
UCP_MAX_SYS_DEVICES);
}
context->tl_rscs[context->num_tls].tl_rsc = *resource;
context->tl_rscs[context->num_tls].md_index = md_index;
context->tl_rscs[context->num_tls].tl_name_csum =
ucs_crc16_string(resource->tl_name);
context->tl_rscs[context->num_tls].flags = rsc_flags;

dev_index = 0;
for (i = 0; i < context->num_tls; ++i) {
if (ucp_tl_resource_is_same_device(&context->tl_rscs[i].tl_rsc, resource)) {
dev_index = context->tl_rscs[i].dev_index;
break;
} else {
dev_index = ucs_max(context->tl_rscs[i].dev_index + 1, dev_index);
}
}
context->tl_rscs[context->num_tls].dev_index = dev_index;
if (context->num_tls == UINT8_MAX) {
Comment thread
rakhmets marked this conversation as resolved.
Outdated
Comment thread
rakhmets marked this conversation as resolved.
Outdated
return;
}

if (resource->sys_device < UCP_MAX_SYS_DEVICES) {
md->sys_dev_map |= UCS_BIT(resource->sys_device);
if (!ucp_is_resource_enabled(resource, config, aux_tls, &rsc_flags,
dev_cfg_masks, tl_cfg_mask)) {
return;
}

if ((resource->sys_device != UCS_SYS_DEVICE_ID_UNKNOWN) &&
(resource->sys_device >= UCP_MAX_SYS_DEVICES)) {
ucs_diag(UCT_TL_RESOURCE_DESC_FMT
" system device is %d, which exceeds the maximal "
"supported (%d), system locality may be ignored",
UCT_TL_RESOURCE_DESC_ARG(resource), resource->sys_device,
UCP_MAX_SYS_DEVICES);
}
context->tl_rscs[context->num_tls].tl_rsc = *resource;
context->tl_rscs[context->num_tls].md_index = md_index;
context->tl_rscs[context->num_tls].tl_name_csum =
ucs_crc16_string(resource->tl_name);
Comment thread
rakhmets marked this conversation as resolved.
Outdated
context->tl_rscs[context->num_tls].flags = rsc_flags;

dev_index = 0;
for (i = 0; i < context->num_tls; ++i) {
if (ucp_tl_resource_is_same_device(&context->tl_rscs[i].tl_rsc, resource)) {
Comment thread
rakhmets marked this conversation as resolved.
Outdated
dev_index = context->tl_rscs[i].dev_index;
break;
} else {
dev_index = ucs_max(context->tl_rscs[i].dev_index + 1, dev_index);
}
}
context->tl_rscs[context->num_tls].dev_index = dev_index;

++context->num_tls;
++(*num_resources_p);
if (resource->sys_device < UCP_MAX_SYS_DEVICES) {
md->sys_dev_map |= UCS_BIT(resource->sys_device);
}

++context->num_tls;
++(*num_resources_p);
}

static ucs_status_t
Expand All @@ -1343,7 +1349,7 @@ ucp_add_tl_resources(ucp_context_h context, ucp_md_index_t md_index,
ucp_tl_resource_desc_t *tmp;
unsigned num_tl_resources;
ucs_status_t status;
ucp_rsc_index_t i;
unsigned i;

*num_resources_p = 0;

Expand Down Expand Up @@ -1696,14 +1702,6 @@ static ucs_status_t ucp_check_resources(ucp_context_h context,
return UCS_ERR_NO_DEVICE;
}

/* Error check: Make sure there are not too many transports */
if (context->num_tls >= UCP_MAX_RESOURCES) {
ucs_error("exceeded transports/devices limit "
"(%u requested, up to %d are supported)",
context->num_tls, UCP_MAX_RESOURCES);
return UCS_ERR_EXCEEDS_LIMIT;
}

return ucp_check_tl_names(context);
}

Expand All @@ -1720,11 +1718,11 @@ ucp_add_component_resources(ucp_context_h context, ucp_rsc_index_t cmpt_index,
uct_component_attr_t uct_component_attr;
unsigned num_tl_resources;
ucs_status_t status;
ucp_rsc_index_t i;
const uct_md_attr_v2_t *md_attr;
unsigned md_index;
uint64_t detect_mem_type_mask;
uint64_t alloc_mem_type_mask;
unsigned i;

/* List memory domain resources */
uct_component_attr.field_mask = UCT_COMPONENT_ATTR_FIELD_MD_RESOURCES |
Expand Down Expand Up @@ -2201,6 +2199,18 @@ static ucs_status_t ucp_fill_resources(ucp_context_h context,
}
}

/* Error check: Make sure there are not too many transports.
* UCP_MAX_RESOURCES must be less than UINT8_MAX to detect overflow. */
UCS_STATIC_ASSERT(UCP_MAX_RESOURCES < UINT8_MAX);
if (context->num_tls > UCP_MAX_RESOURCES) {
Comment thread
iyastreb marked this conversation as resolved.
Outdated
ucs_error("exceeded transports/devices limit "
"(%s%u requested, up to %d are supported)",
(context->num_tls == UINT8_MAX) ? ">=" : "", context->num_tls,
UCP_MAX_RESOURCES);
status = UCS_ERR_EXCEEDS_LIMIT;

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will this change cause ucp_context initialization to fail on a machine with 256 devices?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but as I explained on the issue thread, the infinite loop that you found prevented the error message from appearing, it was always meant to fail.

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

However, this design seems a bit unreasonable to me. I'm only using one of the devices, yet I can't use it just because there are too many devices on the machine. Is there any workaround for this? After all, I can't ask the customers to remove their net device.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, you can limit amount of devices with UCX_NET_DEVICES, or transports with UCX_TLS.
But I agree, that we should probably just print a warning and continue work.
Maybe @brminich @yosefe have opinion on that?

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@iyastreb Actually, I have restricted the devices and transport protocols using UCX_NET_DEVICES=mlx5_0:1 and UCX_TLS=rc,tcp, but the device scanning process doesn't seem to be limited by UCX_NET_DEVICES.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ivanallen It seems that the restrictions from UCX_NET_DEVICES and UCX_TLS would come into effect when the infinite loop bug is resolved.

@ivanallen ivanallen Jun 9, 2026

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@guy-ealey-morag But it returns error here because there are too many devices, so it won't even get the chance to take effect, right?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ivanallen The function ucp_add_tl_resource_if_enabled would add a device only if it's included in the given filters (like those mentioned above), so in your case it would fail only if you don't use any filters.

goto err_free_resources;
}

ucp_fill_resources_reg_md_map_update(context);

/* If unified mode is enabled, initialize tl_bitmap to 0.
Expand Down