Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 23 additions & 2 deletions contrib/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,9 @@ FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG}
ARG OS=${OS:-ubuntu24}
ARG ARCH="x86_64"
ARG DEFAULT_PYTHON_VERSION="3.12"
ARG UCX_REPO="https://github.com/openucx/ucx.git"
ARG UCX_REF="v1.21.x"
ARG UCX_SONAME_SUFFIX=""
ARG BUILD_NIXL_EP="true"
ARG RDMA_CORE_PREFIX="/usr"
ARG UCX_PREFIX="/usr"
Expand Down Expand Up @@ -178,12 +180,29 @@ RUN rm -rf /usr/lib/ucx
RUN rm -rf /opt/hpcx/ucx

RUN cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \
git clone "$UCX_REPO" ucx && \
cd ucx && \
echo "=== Using UCX_REF=$UCX_REF ===" && \
git checkout $UCX_REF && \
if [ "$BUILD_NIXL_EP" = "true" ]; then EXPERIMENTAL_API_PARAM="--enable-experimental-api"; else EXPERIMENTAL_API_PARAM=""; fi && \
./autogen.sh && \
if [ -n "$UCX_SONAME_SUFFIX" ]; then \
if ./configure --help | grep -q -- "--with-soname-suffix"; then \
UCX_SONAME_PARAM="--with-soname-suffix=$UCX_SONAME_SUFFIX"; \
else \
echo "UCX_REF=$UCX_REF does not support --with-soname-suffix" >&2; \
exit 1; \
fi; \
if ./configure --help | grep -q -- "--enable-module-deepbind"; then \
UCX_MODULE_DEEPBIND_PARAM="--enable-module-deepbind"; \
else \
echo "UCX_REF=$UCX_REF does not support --enable-module-deepbind" >&2; \
exit 1; \
fi; \
else \
UCX_SONAME_PARAM=""; \
UCX_MODULE_DEEPBIND_PARAM=""; \
fi && \
./contrib/configure-release-mt \
--prefix=$UCX_PREFIX \
--enable-shared \
Expand All @@ -197,7 +216,9 @@ RUN cd /usr/local/src && \
--with-verbs \
--with-dm \
--without-gdrcopy \
--with-efa && \
--with-efa \
$UCX_SONAME_PARAM \
$UCX_MODULE_DEEPBIND_PARAM && \
make -j${NPROC:-$(nproc)} && \
make -j${NPROC:-$(nproc)} install-strip && \
ldconfig
Expand Down
25 changes: 23 additions & 2 deletions contrib/Dockerfile.manylinux
Original file line number Diff line number Diff line change
Expand Up @@ -300,13 +300,32 @@ RUN export UV_INDEX="https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | c
# Upgrade setuptools to latest version for compatibility with PEP 639 (license format)
RUN uv pip install --upgrade 'setuptools>=80.9.0'

ARG UCX_REPO="https://github.com/openucx/ucx.git"
ARG UCX_REF="v1.21.x"
ARG UCX_SONAME_SUFFIX=""
RUN cd /usr/local/src && \
git clone https://github.com/openucx/ucx.git && \
git clone "$UCX_REPO" ucx && \
cd ucx && \
git checkout "${UCX_REF}" && \
git log -1 && \
./autogen.sh && \
if [ -n "$UCX_SONAME_SUFFIX" ]; then \
if ./configure --help | grep -q -- "--with-soname-suffix"; then \
UCX_SONAME_PARAM="--with-soname-suffix=$UCX_SONAME_SUFFIX"; \
else \
echo "UCX_REF=$UCX_REF does not support --with-soname-suffix" >&2; \
exit 1; \
fi; \
if ./configure --help | grep -q -- "--enable-module-deepbind"; then \
UCX_MODULE_DEEPBIND_PARAM="--enable-module-deepbind"; \
else \
echo "UCX_REF=$UCX_REF does not support --enable-module-deepbind" >&2; \
exit 1; \
fi; \
else \
UCX_SONAME_PARAM=""; \
UCX_MODULE_DEEPBIND_PARAM=""; \
fi && \
./contrib/configure-release-mt \
--enable-shared \
--disable-static \
Expand All @@ -319,7 +338,9 @@ RUN cd /usr/local/src && \
--with-verbs \
--with-dm \
--without-gdrcopy \
--with-efa && \
--with-efa \
$UCX_SONAME_PARAM \
$UCX_MODULE_DEEPBIND_PARAM && \
make -j && \
make -j install-strip && \
ldconfig
Expand Down
35 changes: 35 additions & 0 deletions contrib/build-container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,10 @@ ARCH=$(uname -m)
WHL_BASE=manylinux_2_39
WHL_PLATFORM=${WHL_BASE}_${ARCH}
WHL_PYTHON_VERSIONS="3.12"
UCX_REPO=${UCX_REPO:-https://github.com/openucx/ucx.git}
UCX_REF=${UCX_REF:-v1.21.x}
UCX_SONAME_SUFFIX=${UCX_SONAME_SUFFIX:-}
PRIVATE_UCX_SONAME_SUFFIX="nixl"
BUILD_NIXL_EP="true"
OS="ubuntu24"
NPROC=${NPROC:-$(nproc)}
Expand Down Expand Up @@ -116,6 +119,14 @@ get_options() {
missing_requirement $1
fi
;;
--ucx-repo)
if [ "$2" ]; then
UCX_REPO=$2
shift
else
missing_requirement $1
fi
;;
--ucx-ref)
if [ "$2" ]; then
UCX_REF=$2
Expand All @@ -124,6 +135,17 @@ get_options() {
missing_requirement $1
fi
;;
--ucx-soname-suffix)
if [ "$2" ]; then
UCX_SONAME_SUFFIX=$2
shift
else
missing_requirement $1
fi
;;
--private-ucx)
UCX_SONAME_SUFFIX=${UCX_SONAME_SUFFIX:-$PRIVATE_UCX_SONAME_SUFFIX}
;;
--build-nixl-ep)
BUILD_NIXL_EP=true
;;
Expand Down Expand Up @@ -174,7 +196,15 @@ show_build_options() {
echo "Container arch: ${ARCH}"
echo "Python Versions for wheel build: ${WHL_PYTHON_VERSIONS}"
echo "Wheel Platform: ${WHL_PLATFORM}"
echo "UCX Repo: ${UCX_REPO}"
echo "UCX Ref: ${UCX_REF}"
if [ -n "$UCX_SONAME_SUFFIX" ]; then
echo "UCX SONAME suffix: ${UCX_SONAME_SUFFIX}"
echo "UCX module deepbind: Enabled"
else
echo "UCX SONAME suffix: Disabled"
echo "UCX module deepbind: Disabled"
fi
if [ "$BUILD_NIXL_EP" = "true" ]; then
echo "NIXL EP: Enabled"
else
Expand All @@ -193,7 +223,10 @@ show_help() {
echo " [--build-type [debug|release] to select build type (default: release)]"
echo " [--tag tag for image]"
echo " [--python-versions python versions to build for, comma separated]"
echo " [--ucx-repo ucx git repository URL]"
echo " [--ucx-ref ucx git reference (branch, tag, or sha)]"
echo " [--ucx-soname-suffix suffix to pass to UCX --with-soname-suffix]"
echo " [--private-ucx shortcut for --ucx-soname-suffix ${PRIVATE_UCX_SONAME_SUFFIX}; requires a UCX ref with --with-soname-suffix and --enable-module-deepbind]"
echo " [--build-nixl-ep build NIXL with NIXL EP support (requires UCX >= 1.21)]"
echo " [--arch [x86_64|aarch64] to select target architecture]"
echo " [--dockerfile path to a dockerfile to use]"
Expand All @@ -220,7 +253,9 @@ BUILD_ARGS+=" --build-arg BASE_IMAGE=$BASE_IMAGE --build-arg BASE_IMAGE_TAG=$BAS
BUILD_ARGS+=" --build-arg WHL_PYTHON_VERSIONS=$WHL_PYTHON_VERSIONS"
BUILD_ARGS+=" --build-arg WHL_PLATFORM=$WHL_PLATFORM"
BUILD_ARGS+=" --build-arg ARCH=$ARCH"
BUILD_ARGS+=" --build-arg UCX_REPO=$UCX_REPO"
BUILD_ARGS+=" --build-arg UCX_REF=$UCX_REF"
BUILD_ARGS+=" --build-arg UCX_SONAME_SUFFIX=$UCX_SONAME_SUFFIX"
BUILD_ARGS+=" --build-arg BUILD_NIXL_EP=$BUILD_NIXL_EP"
BUILD_ARGS+=" --build-arg NPROC=$NPROC"
BUILD_ARGS+=" --build-arg GRPC_NPROC=$GRPC_NPROC"
Expand Down
194 changes: 194 additions & 0 deletions contrib/check_ucx_binding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
#!/usr/bin/env python3

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import argparse
import os
import re
import subprocess
import sys

# Source of the probe program that check_bindings() runs in a separate
# interpreter via ``python -c``.  It is configured only through environment
# variables:
#   CHECK_UCX_LIBRARY           (required) library or plugin to dlopen
#   CHECK_UCX_PRELOAD           (optional) library loaded first with RTLD_GLOBAL
#   CHECK_UCX_DEEPBIND          "1" adds RTLD_DEEPBIND to the dlopen flags
#   CHECK_UCX_CALL_PLUGIN_INIT  "1" calls nixl_plugin_init() after loading and
#                               nixl_plugin_fini() when the library exports it
# The probe flushes its streams and leaves through os._exit(0), which skips
# normal interpreter shutdown (presumably so that unloading the probed
# libraries cannot turn a passing check into a crash -- confirm with author).
# The text must stay valid top-level Python: statements start at column 0.
CHILD_SCRIPT = r"""
import ctypes
import os
import sys

preload = os.environ.get("CHECK_UCX_PRELOAD")
library = os.environ["CHECK_UCX_LIBRARY"]
deepbind = os.environ.get("CHECK_UCX_DEEPBIND") == "1"
call_plugin_init = os.environ.get("CHECK_UCX_CALL_PLUGIN_INIT") == "1"

if preload:
    ctypes.CDLL(preload, mode=os.RTLD_NOW | os.RTLD_GLOBAL)

mode = os.RTLD_NOW | os.RTLD_LOCAL
if deepbind:
    mode |= getattr(os, "RTLD_DEEPBIND", 0x8)

lib = ctypes.CDLL(library, mode=mode)
if call_plugin_init:
    init = lib.nixl_plugin_init
    init.restype = ctypes.c_void_p
    plugin = init()
    if not plugin:
        raise RuntimeError("nixl_plugin_init returned null")
    try:
        fini = lib.nixl_plugin_fini
    except AttributeError:
        fini = None
    if fini is not None:
        fini()

sys.stdout.flush()
sys.stderr.flush()
os._exit(0)
"""


def run_checked(cmd, **kwargs):
    """Run *cmd* and return its captured ``(stdout, stderr)`` as text.

    Args:
        cmd: Command as a sequence of strings, passed to ``subprocess.run``.
        **kwargs: Extra keyword arguments forwarded to ``subprocess.run``.

    Returns:
        A ``(stdout, stderr)`` tuple of the decoded output streams.

    Raises:
        RuntimeError: If the command exits with a non-zero status.  The
            message carries the command line and both captured streams.
    """
    proc = subprocess.run(cmd, text=True, capture_output=True, **kwargs)
    if proc.returncode == 0:
        return proc.stdout, proc.stderr

    command_line = " ".join(cmd)
    raise RuntimeError(
        f"command failed: {command_line}\n"
        f"stdout:\n{proc.stdout}\n"
        f"stderr:\n{proc.stderr}"
    )


def check_needed(library, expected_needed):
    """Verify that *library* declares every expected DT_NEEDED dependency.

    The dynamic section is read once with ``readelf -d``; a ``needed ok``
    line is printed for each entry that is present.

    Args:
        library: Path of the ELF file to inspect.
        expected_needed: Names that must appear as ``Shared library: [NAME]``
            in the readelf output.  When empty, nothing is run.

    Raises:
        RuntimeError: On the first expected entry that is missing, or when
            ``readelf`` itself fails.
    """
    if not expected_needed:
        return

    dynamic_section = run_checked(["readelf", "-d", library])[0]
    for soname in expected_needed:
        if f"Shared library: [{soname}]" in dynamic_section:
            print(f"needed ok: {soname}")
            continue
        raise RuntimeError(f"{library} does not NEEDED {soname}")


def parse_expectations(values):
    """Turn ``SYMBOL=TARGET_SUBSTRING`` strings into a symbol -> target dict.

    Only the first ``=`` separates the two parts, so the target may itself
    contain ``=``.  When a symbol is given more than once the last value wins.

    Args:
        values: Iterable of ``SYMBOL=TARGET_SUBSTRING`` strings.

    Returns:
        Dict mapping each symbol name to its expected target substring.

    Raises:
        RuntimeError: If an item has no ``=`` or an empty symbol or target.
    """
    parsed = {}
    for item in values:
        symbol, separator, target = item.partition("=")
        if not (separator and symbol and target):
            raise RuntimeError(f"expected SYMBOL=TARGET_SUBSTRING, got {item!r}")
        parsed[symbol] = target
    return parsed


def find_bindings(debug_output, source_name):
    """Extract symbol bindings for one source object from a binding trace.

    Scans *debug_output* line by line for ``binding file SRC [..] to DST [..]:
    ... symbol `NAME'`` records and keeps those whose SRC contains
    *source_name*.

    Args:
        debug_output: Text to scan (the caller passes captured
            ``LD_DEBUG=bindings`` output).
        source_name: Substring that the binding's source file must contain.

    Returns:
        Dict mapping symbol name to the target file it was bound to.  When a
        symbol is reported several times the last record wins.
    """
    pattern = re.compile(
        r"binding file (?P<source>.+?) \[[^\]]+\] to (?P<target>.+?) "
        r"\[[^\]]+\]: .* symbol [`'](?P<symbol>[^`']+)[`']"
    )
    hits = map(pattern.search, debug_output.splitlines())
    return {
        hit["symbol"]: hit["target"]
        for hit in hits
        if hit is not None and source_name in hit["source"]
    }


def check_bindings(args):
    """Load ``args.library`` in a child interpreter and verify symbol bindings.

    The child runs CHILD_SCRIPT and is configured through environment
    variables derived from *args*.  When binding expectations are given, the
    child is started with ``LD_DEBUG=bindings`` and its combined stdout and
    stderr is searched for the binding of each expected symbol; a
    ``binding ok`` line is printed for every match.

    Nothing is executed when there are no expectations and
    ``args.call_nixl_plugin_init`` is false.

    Args:
        args: Parsed command-line namespace.  Reads ``expect_binding``,
            ``expected_soname``, ``call_nixl_plugin_init``, ``library``,
            ``deepbind`` and ``preload_ucx``.

    Raises:
        RuntimeError: If ``--expected-soname`` is used without
            ``--call-nixl-plugin-init``, if the child exits with a non-zero
            status, if an expected symbol has no binding record, or if it is
            bound to a target that does not contain the expected substring.
    """
    expectations = parse_expectations(args.expect_binding)
    if args.expected_soname and not args.call_nixl_plugin_init:
        raise RuntimeError("--expected-soname requires --call-nixl-plugin-init")

    if not expectations and not args.call_nixl_plugin_init:
        return

    env = os.environ.copy()
    # The optional CHECK_UCX_* variables are this script's private protocol
    # with the probe.  Drop inherited values so that only the command-line
    # options decide whether a library is preloaded or the plugin is
    # initialised.
    for private_name in ("CHECK_UCX_PRELOAD", "CHECK_UCX_CALL_PLUGIN_INIT"):
        env.pop(private_name, None)
    if expectations:
        env["LD_DEBUG"] = "bindings"
        # With LD_DEBUG_OUTPUT set, glibc writes the trace to a file instead
        # of stderr and none of the expected bindings would be found below.
        env.pop("LD_DEBUG_OUTPUT", None)
    env["CHECK_UCX_LIBRARY"] = args.library
    env["CHECK_UCX_DEEPBIND"] = "1" if args.deepbind else "0"
    if args.preload_ucx:
        env["CHECK_UCX_PRELOAD"] = args.preload_ucx
    if args.call_nixl_plugin_init:
        env["CHECK_UCX_CALL_PLUGIN_INIT"] = "1"
    if args.expected_soname:
        # Not read by the probe script itself; presumably consumed by the
        # NIXL UCX plugin during nixl_plugin_init -- confirm against plugin.
        env["NIXL_UCX_EXPECTED_SONAME"] = args.expected_soname

    completed = subprocess.run(
        [sys.executable, "-c", CHILD_SCRIPT],
        text=True,
        capture_output=True,
        env=env,
    )
    output = completed.stdout + completed.stderr
    if completed.returncode != 0:
        raise RuntimeError(
            f"binding probe failed with exit code {completed.returncode}\n{output}"
        )

    if expectations:
        # Binding records name the source object by path; match on the
        # file name so the check does not depend on the install directory.
        source_name = os.path.basename(args.library)
        bindings = find_bindings(output, source_name)
        for symbol, expected_target in expectations.items():
            target = bindings.get(symbol)
            if target is None:
                raise RuntimeError(
                    f"no LD_DEBUG binding found for {source_name}:{symbol}"
                )
            if expected_target not in target:
                raise RuntimeError(
                    f"{source_name}:{symbol} bound to {target}, expected {expected_target}"
                )
            print(f"binding ok: {symbol} -> {target}")


def main(argv=None):
    """Parse the command line and run the DT_NEEDED and binding checks.

    Args:
        argv: Optional list of argument strings to parse instead of
            ``sys.argv[1:]``.  The default ``None`` keeps the original
            behavior of reading the process arguments, so existing callers
            of ``main()`` are unaffected.

    Raises:
        SystemExit: Raised by argparse for ``--help`` or invalid options.
        RuntimeError: Propagated from the checks when one of them fails.
    """
    parser = argparse.ArgumentParser(
        description="Check UCX dynamic linkage and glibc symbol binding for a NIXL artifact."
    )
    parser.add_argument("--library", required=True, help="Library or plugin to dlopen")
    parser.add_argument(
        "--preload-ucx",
        help="External libucp.so.0 to load with RTLD_GLOBAL before loading --library",
    )
    parser.add_argument(
        "--deepbind",
        action="store_true",
        help="Load --library with RTLD_DEEPBIND",
    )
    parser.add_argument(
        "--call-nixl-plugin-init",
        action="store_true",
        help="Call nixl_plugin_init after dlopen; useful for libplugin_UCX.so",
    )
    parser.add_argument(
        "--expected-soname",
        help="Set NIXL_UCX_EXPECTED_SONAME while calling nixl_plugin_init",
    )
    parser.add_argument(
        "--expect-needed",
        action="append",
        default=[],
        help="Require a DT_NEEDED entry, for example libucp-nixl.so.0",
    )
    parser.add_argument(
        "--expect-binding",
        action="append",
        default=[],
        help="Require SYMBOL to bind to a target substring, for example ucp_init_version=libucp-nixl",
    )
    # parse_args(None) falls back to sys.argv[1:], preserving the old behavior.
    args = parser.parse_args(argv)

    # Static check first: it only needs readelf and fails fast before the
    # library is actually loaded by the binding probe.
    check_needed(args.library, args.expect_needed)
    check_bindings(args)


if __name__ == "__main__":
    # Script entry point: report any failed check as a single "error: ..."
    # line on stderr and exit with status 1 instead of showing a traceback.
    try:
        main()
    except Exception as failure:
        sys.stderr.write(f"error: {failure}\n")
        sys.exit(1)
Loading
Loading