From ac395b56995fc85340d3b846bc3025f8bdfa3462 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Wed, 3 Jun 2026 00:00:07 +0300 Subject: [PATCH 1/7] BUILD: add rapidsai CI wrapper images for UCXX UCXX tests run in rapidsai/ci-conda and ci-wheel base images. Thin wrappers open /opt/conda and /pyenv so the Azure-injected step user can use them, and add gdb so ucxx's timeout_with_stack.py can capture stacks on hangs. --- buildlib/dockers/rapidsai-ci-conda.Dockerfile | 10 ++++++++++ buildlib/dockers/rapidsai-ci-wheel.Dockerfile | 10 ++++++++++ 2 files changed, 20 insertions(+) create mode 100644 buildlib/dockers/rapidsai-ci-conda.Dockerfile create mode 100644 buildlib/dockers/rapidsai-ci-wheel.Dockerfile diff --git a/buildlib/dockers/rapidsai-ci-conda.Dockerfile b/buildlib/dockers/rapidsai-ci-conda.Dockerfile new file mode 100644 index 00000000000..3dc3e032466 --- /dev/null +++ b/buildlib/dockers/rapidsai-ci-conda.Dockerfile @@ -0,0 +1,10 @@ +# Azure wrapper around rapidsai/ci-conda: chmod /opt/conda so the non-root UID Azure runs +# steps as can use conda/python (rapidsai owns it as root); + adds gdb for stack capture. + +ARG BASE_IMAGE=rapidsai/ci-conda:26.06-latest +FROM ${BASE_IMAGE} + +RUN chmod -R o+rwX /opt/conda \ + && apt-get update \ + && apt-get install -y --no-install-recommends gdb \ + && rm -rf /var/lib/apt/lists/* diff --git a/buildlib/dockers/rapidsai-ci-wheel.Dockerfile b/buildlib/dockers/rapidsai-ci-wheel.Dockerfile new file mode 100644 index 00000000000..99dfd2bf90d --- /dev/null +++ b/buildlib/dockers/rapidsai-ci-wheel.Dockerfile @@ -0,0 +1,10 @@ +# Azure wrapper around rapidsai/ci-wheel: chmod /pyenv so the non-root UID Azure runs +# steps as can write there (rapidsai owns it as root); + adds gdb for stack capture. +# Default base = cuda13; cuda12 image built by overriding BASE_IMAGE to the cuda12.9.1 tag. 
+
+ARG BASE_IMAGE=rapidsai/ci-wheel:26.06-cuda13.2.0-rockylinux8-py3.11
+FROM ${BASE_IMAGE}
+
+RUN chmod -R o+rwX /pyenv \
+    && dnf install -y gdb \
+    && dnf clean all
From 12be9b77f3b9531cb427073f3ecafcf6808c2a45 Mon Sep 17 00:00:00 2001
From: Alexey Rivkin
Date: Wed, 3 Jun 2026 00:00:08 +0300
Subject: [PATCH 2/7] AZP: add UCXX build + test stages to the PR pipeline

Pull rapidsai/ucxx as a pipeline resource and add two stages gated on
Static_check: UCXX_build (conda + wheel packages, docs, devcontainer,
checks) then UCXX_tests (conda C++/Python on the CPU + GPU matrix).
Covers x86_64 + aarch64, CUDA 12 + 13; GPU tests on amd64/cuda13.
distributed-ucxx excluded (not upstreamed).
---
Reviewer note: this patch only adds build_ucxx.sh and test_ucxx.sh, while
the subject/body above describe the pipeline YAML; the commit message
appears to be swapped with [PATCH 3/7].

 buildlib/tools/build_ucxx.sh | 104 +++++++++++++++++++++++++++++++++++
 buildlib/tools/test_ucxx.sh  |  92 +++++++++++++++++++++++++++++++
 2 files changed, 196 insertions(+)
 create mode 100755 buildlib/tools/build_ucxx.sh
 create mode 100755 buildlib/tools/test_ucxx.sh

diff --git a/buildlib/tools/build_ucxx.sh b/buildlib/tools/build_ucxx.sh
new file mode 100755
index 00000000000..b4f4ffdf0ed
--- /dev/null
+++ b/buildlib/tools/build_ucxx.sh
@@ -0,0 +1,104 @@
+#!/bin/bash -eE
+#
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# See file LICENSE for terms.
+#
+# Usage: build_ucxx.sh <phase>
+# Env: UCXX_DIR (all phases). Build phases also need RAPIDS_CUDA_VERSION,
+#      RAPIDS_PY_VERSION, RAPIDS_BLD_OUTPUT_DIR.
+#      wheel_ucxx phase also requires WHEEL_INPUT_DIR (libucxx wheel artifact dir)
+#      Docs phase env: CPP_CHANNEL_DIR, PYTHON_CHANNEL_DIR, RAPIDS_DOCS_DIR
+set -eE  # the pipeline runs "bash build_ucxx.sh ...", which ignores the shebang's -eE
+phase=${1:?phase required}
+: "${UCXX_DIR:?UCXX_DIR required}"
+
+case "$phase" in
+    devcontainer)
+        # Parse each .devcontainer config; verify its Dockerfile + BASE exist
+        # (no registry pull - the devcontainer CLI catches missing images at use).
+        UCXX_DIR="$UCXX_DIR" python3 - <<'PY'
+import glob, json, os, sys
+root = os.path.abspath(os.environ["UCXX_DIR"])  # absolute, so the joins below stay valid
+cfgs = glob.glob(os.path.join(root, ".devcontainer", "*", "devcontainer.json"))
+if not cfgs:
+    sys.exit("ERROR: no devcontainer.json under .devcontainer/")
+for cfg in cfgs:  # a relative "dockerfile" is resolved against the config's own directory
+    b = json.load(open(cfg))["build"]
+    df = os.path.join(os.path.dirname(cfg), b["dockerfile"].replace("${localWorkspaceFolder}", root))
+    assert os.path.isfile(df), f"{cfg}: missing Dockerfile {df}"
+    assert b["args"]["BASE"], f"{cfg}: empty BASE"
+    print(f"OK {cfg}")
+PY
+        exit 0 ;;
+esac
+
+: "${RAPIDS_CUDA_VERSION:?RAPIDS_CUDA_VERSION required}"
+: "${RAPIDS_PY_VERSION:?RAPIDS_PY_VERSION required}"
+: "${RAPIDS_BLD_OUTPUT_DIR:?RAPIDS_BLD_OUTPUT_DIR required}"
+
+export RAPIDS_CUDA_VERSION RAPIDS_PY_VERSION
+mkdir -p "$RAPIDS_BLD_OUTPUT_DIR"
+
+case "$phase" in
+    conda_*) export RAPIDS_CONDA_BLD_OUTPUT_DIR="$RAPIDS_BLD_OUTPUT_DIR" ;;
+    wheel_*) export RAPIDS_WHEEL_BLD_OUTPUT_DIR="$RAPIDS_BLD_OUTPUT_DIR" ;;
+    docs)
+        : "${CPP_CHANNEL_DIR:?CPP_CHANNEL_DIR required for docs phase}"
+        : "${PYTHON_CHANNEL_DIR:?PYTHON_CHANNEL_DIR required for docs phase}"
+        : "${RAPIDS_DOCS_DIR:?RAPIDS_DOCS_DIR required for docs phase}"
+        mkdir -p "$RAPIDS_DOCS_DIR" ;;
+esac
+
+mkdir -p "$HOME/.local/bin"
+for tool in rapids-download-conda-from-github rapids-download-from-github; do
+    printf '#!/bin/bash\necho "%s"\n' "$RAPIDS_BLD_OUTPUT_DIR" > "$HOME/.local/bin/$tool"
+    chmod +x "$HOME/.local/bin/$tool"
+done
+# Docs phase: override shims to point at the staged conda channels.
+if [ "$phase" = "docs" ]; then
+    printf '#!/bin/bash\necho "%s"\n' "$CPP_CHANNEL_DIR" > "$HOME/.local/bin/rapids-download-conda-from-github"
+    printf '#!/bin/bash\necho "%s"\n' "$PYTHON_CHANNEL_DIR" > "$HOME/.local/bin/rapids-download-from-github"
+fi
+
+if [ -n "${WHEEL_INPUT_DIR:-}" ]; then
+    printf '#!/bin/bash\necho "%s"\n' "$WHEEL_INPUT_DIR" > "$HOME/.local/bin/rapids-download-wheels-from-github"
+    chmod +x "$HOME/.local/bin/rapids-download-wheels-from-github"
+fi
+
+export PATH="$HOME/.local/bin:$PATH"
+
+cd "$UCXX_DIR"
+
+# Wheel builds otherwise pick system gcc 8.5 (too old for libucxx's C++20);
+# point CC/CXX at gcc-toolset-14.
+if [[ "$phase" == wheel_* ]]; then
+    toolset=/opt/rh/gcc-toolset-14/root/usr/bin
+    [ -x "$toolset/gcc" ] \
+        || { echo "ERROR: gcc-toolset-14 not found at $toolset (needed for libucxx C++20)" >&2; exit 1; }
+    export CC="$toolset/gcc" CXX="$toolset/g++"
+fi
+
+# Upstream ucxx header uses usleep() but omits <unistd.h>; undeclared on
+# newer gcc. Affects all C++ phases.
+if [[ "$phase" != docs ]]; then
+    hdr=python/ucxx/ucxx/examples/python_future_task.h
+    grep -q "include <unistd.h>" "$hdr" || sed -i '/^#pragma once/a #include <unistd.h>' "$hdr"
+fi
+
+case "$phase" in
+    conda_cpp) bash ci/build_cpp.sh ;;
+    conda_python) bash ci/build_python.sh ;;
+    wheel_libucxx) bash ci/build_wheel_libucxx.sh ;;
+    wheel_ucxx)
+        : "${WHEEL_INPUT_DIR:?WHEEL_INPUT_DIR required for wheel_ucxx (libucxx wheel dir)}"
+        bash ci/build_wheel_ucxx.sh ;;
+    docs)
+        # Upstream forces RAPIDS_DOCS_DIR=$(mktemp -d); make it default-if-unset
+        # so our staged output dir survives. Guard catches upstream rewording.
+        sed -i 's|RAPIDS_DOCS_DIR="$(mktemp -d)"|: "${RAPIDS_DOCS_DIR:=$(mktemp -d)}"|' ci/build_docs.sh
+        grep -q 'RAPIDS_DOCS_DIR:=' ci/build_docs.sh \
+            || { echo "ERROR: docs patch did not apply to ci/build_docs.sh" >&2; exit 1; }
+        bash ci/build_docs.sh ;;
+    *) echo "Unknown phase: $phase" >&2; exit 1 ;;
+esac
diff --git a/buildlib/tools/test_ucxx.sh b/buildlib/tools/test_ucxx.sh
new file mode 100755
index 00000000000..cc6d2aaeca2
--- /dev/null
+++ b/buildlib/tools/test_ucxx.sh
@@ -0,0 +1,92 @@
+#!/bin/bash -eE
+#
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# See file LICENSE for terms.
+#
+# Usage: test_ucxx.sh <phase>
+# Env: RAPIDS_CUDA_VERSION, RAPIDS_PY_VERSION, UCXX_DIR
+#   build|test_cpp|test_python: also IS_GPU
+#   test_wheel_ucxx: also LIBUCXX_WHL_DIR, UCXX_WHL_DIR
+set -eE  # the pipeline runs "bash test_ucxx.sh ...", which ignores the shebang's -eE
+phase=${1:?phase required}
+case "$phase" in
+    build|test_cpp|test_python) : "${IS_GPU:?IS_GPU required}" ;;
+esac
+: "${RAPIDS_CUDA_VERSION:?RAPIDS_CUDA_VERSION required}"
+: "${RAPIDS_PY_VERSION:?RAPIDS_PY_VERSION required}"
+: "${UCXX_DIR:?UCXX_DIR required}"
+
+export RAPIDS_CUDA_VERSION RAPIDS_PY_VERSION
+export RAPIDS_CONDA_BLD_OUTPUT_DIR=/tmp/conda-bld-output
+mkdir -p "$RAPIDS_CONDA_BLD_OUTPUT_DIR" "$HOME/.local/bin"
+
+for tool in rapids-download-conda-from-github rapids-download-from-github; do
+    printf '#!/bin/bash\necho "%s"\n' "$RAPIDS_CONDA_BLD_OUTPUT_DIR" > "$HOME/.local/bin/$tool"
+    chmod +x "$HOME/.local/bin/$tool"
+done
+export PATH="$HOME/.local/bin:$PATH"
+
+cd "$UCXX_DIR"
+
+# Tolerate missing nvidia-smi on CPU containers. Guard catches upstream rewording.
+sed -i 's#^ nvidia-smi$# command -v nvidia-smi >/dev/null \&\& nvidia-smi || echo "(no GPU)"#' ci/test_common.sh
+grep -q 'command -v nvidia-smi' ci/test_common.sh \
+    || { echo "ERROR: nvidia-smi patch did not apply to ci/test_common.sh" >&2; exit 1; }
+
+# Skip test_client_shutdown: its teardown crashes the xdist worker under
+# full-pipeline GPU/MPS contention (flaky upstream test, not a UCX issue).
+# Patched once per checkout (this script runs per phase). Guard catches upstream rewording.
+grep -q "not test_client_shutdown" ci/run_python.sh || sed -i "s#--runslow#--runslow -k 'not test_client_shutdown'#" ci/run_python.sh
+grep -q "not test_client_shutdown" ci/run_python.sh \
+    || { echo "ERROR: test_client_shutdown skip did not apply to ci/run_python.sh" >&2; exit 1; }
+
+# Force host driver ahead of the image's newer compat driver (MPS rejects a client
+# newer than the daemon -> cuInit hangs). ubuntu: /usr/lib/<arch>-linux-gnu; wheel: /usr/lib64.
+arch=$(uname -m)
+for hostlib in "/usr/lib/$arch-linux-gnu" /usr/lib64; do
+    [ -d "$hostlib" ] && export LD_LIBRARY_PATH="$hostlib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"  # no trailing ':' (empty entry = cwd)
+done
+
+case "$phase" in
+    build)
+        # Upstream ucxx examples header uses usleep() but omits <unistd.h>;
+        # undeclared on newer gcc. Same patch as build_ucxx.sh.
+        hdr=python/ucxx/ucxx/examples/python_future_task.h
+        grep -q "include <unistd.h>" "$hdr" || sed -i '/^#pragma once/a #include <unistd.h>' "$hdr"
+        if [ "${IS_GPU,,}" = "true" ]; then
+            # sccache wrapper crashes CMake's compiler probe on the GPU build hosts; no-op it.
+            cat > "$HOME/.local/bin/rapids-configure-sccache" <<'EOF'
+#!/bin/bash
+export CMAKE_C_COMPILER_LAUNCHER= CMAKE_CXX_COMPILER_LAUNCHER= CMAKE_CUDA_COMPILER_LAUNCHER= RUSTC_WRAPPER=
+EOF
+            chmod +x "$HOME/.local/bin/rapids-configure-sccache"
+        fi
+        bash ci/build_cpp.sh
+        bash ci/build_python.sh
+        ;;
+
+    test_cpp)
+        # CPU slices have no GPU device bound; CUDA-touching gtests would crash.
+        if [ "${IS_GPU,,}" = "true" ]; then  # ${IS_GPU,,} lowercases, so "True" also matches
+            bash ci/test_cpp.sh
+        else  # CPU slice: hide CUDA devices, limit UCX to tcp/sm/self, exclude RMM*/CCCL* gtests
+            CUDA_VISIBLE_DEVICES= UCX_TLS=tcp,sm,self GTEST_FILTER='-RMM*.*:CCCL*.*' \
+                bash ci/test_cpp.sh
+        fi
+        ;;
+
+    test_python)
+        bash ci/test_python.sh
+        ;;
+
+    test_wheel_ucxx)  # point the rapids download shims at the staged wheel dirs
+        : "${LIBUCXX_WHL_DIR:?LIBUCXX_WHL_DIR required}"
+        : "${UCXX_WHL_DIR:?UCXX_WHL_DIR required}"
+        printf '#!/bin/bash\necho "%s"\n' "$LIBUCXX_WHL_DIR" > "$HOME/.local/bin/rapids-download-wheels-from-github"
+        printf '#!/bin/bash\necho "%s"\n' "$UCXX_WHL_DIR" > "$HOME/.local/bin/rapids-download-from-github"
+        chmod +x "$HOME/.local/bin/rapids-download-wheels-from-github" "$HOME/.local/bin/rapids-download-from-github"
+        bash ci/test_wheel_ucxx.sh
+        ;;
+
+    *) echo "Unknown phase: $phase" >&2; exit 1 ;;
+esac
From b73790358a535862420cf85a90ef28c6ef94c069 Mon Sep 17 00:00:00 2001
From: Alexey Rivkin
Date: Wed, 3 Jun 2026 00:00:08 +0300
Subject: [PATCH 3/7] AZP: add UCXX build/test driver scripts

build_ucxx.sh and test_ucxx.sh wrap UCXX's ci/*.sh entrypoints for the
Azure agents: stage rapids download shims, set the wheel toolchain, run
the conda/wheel build, C++ gtest and Python test phases. CPU slices
disable CUDA-only gtests; GPU slices force the host CUDA driver so
cuInit matches the MPS daemon. test_client_shutdown is skipped (flaky
teardown under MPS contention).
--- buildlib/azure-pipelines-pr.yml | 8 + buildlib/pr/main.yml | 67 ++++++++ buildlib/pr/ucxx_build.yml | 277 ++++++++++++++++++++++++++++++++ buildlib/pr/ucxx_tests.yml | 48 ++++++ 4 files changed, 400 insertions(+) create mode 100644 buildlib/pr/ucxx_build.yml create mode 100644 buildlib/pr/ucxx_tests.yml diff --git a/buildlib/azure-pipelines-pr.yml b/buildlib/azure-pipelines-pr.yml index d446fe72ea7..44ccf60fa5f 100644 --- a/buildlib/azure-pipelines-pr.yml +++ b/buildlib/azure-pipelines-pr.yml @@ -31,5 +31,13 @@ pr: - buildlib/tools/perf_results.py - buildlib/tools/perf-common.yml +resources: + repositories: + - repository: ucxx + type: github + name: rapidsai/ucxx + endpoint: Mellanox-lab + ref: refs/heads/main + extends: template: pr/main.yml diff --git a/buildlib/pr/main.yml b/buildlib/pr/main.yml index 0e087877a5b..f4b667fb0c9 100644 --- a/buildlib/pr/main.yml +++ b/buildlib/pr/main.yml @@ -261,6 +261,27 @@ resources: - container: centos10stream image: rdmz-harbor.rdmz.labs.mlnx/hpcx/x86_64/centos10stream/builder:inbox options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_conda + # Thin wrapper of rapidsai/ci-conda; see buildlib/dockers/rapidsai-ci-conda.Dockerfile. + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.06-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_conda_gpu + # No IB/host-net: with IB, UCX binds rc_mlx5 and the AM/tag tests hang here. + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.06-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host + # Wheel images are CUDA-pinned: one base per CUDA version (see rapidsai-ci-wheel.Dockerfile). 
+ - container: ucxx_rapidsai_ci_wheel_cuda13 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.06-cuda13-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_wheel_cuda13_gpu + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.06-cuda13-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host + - container: ucxx_rapidsai_ci_wheel_cuda12 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.06-cuda12-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_wheel_cuda12_gpu + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.06-cuda12-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host stages: - stage: Codestyle @@ -349,6 +370,52 @@ stages: demands: ucx_docker -equals yes container: coverity_rh7 + - template: ucxx_build.yml + parameters: + dependsOn: [Static_check] + # Wheel GPU tests run on x86 only (no arm64 GPU runner); cuda12 + cuda13. 
+ wheel_tests_ucxx_slices: + - { name: x86_64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12_gpu, libucxx_slice: x86_64_cuda12_py311, ucxx_slice: x86_64_cuda12_py311, + demands: ucx_gpu, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13_gpu, libucxx_slice: x86_64_cuda13_py311, ucxx_slice: x86_64_cuda13_py311, + demands: ucx_gpu, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + conda_cpp_slices: + - { name: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + conda_python_slices: + - { name: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + # libucxx + ucxx wheels for cuda12 + cuda13, x86_64 + aarch64. 
+ wheel_libucxx_slices: + - { name: x86_64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + wheel_ucxx_slices: + - { name: x86_64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, libucxx_slice: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, libucxx_slice: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, libucxx_slice: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, libucxx_slice: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + docs_slices: + - { name: x86_64_cuda13_py311, cpp_slice: x86_64_cuda13_py311, python_slice: x86_64_cuda13_py311, + demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + devcontainer_slices: + - { name: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + + - template: ucxx_tests.yml + parameters: + dependsOn: [Static_check] + slices: + - { name: x86_64_cuda13_py313, gpu: true, demands: ucx_gpu, rapids_cuda_version: '13.2.0', rapids_py_version: '3.13' } + - { name: 
x86_64_cuda12_py311, gpu: false, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, gpu: false, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, gpu: false, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, gpu: false, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - stage: Tests dependsOn: [Basic_compile] jobs: diff --git a/buildlib/pr/ucxx_build.yml b/buildlib/pr/ucxx_build.yml new file mode 100644 index 00000000000..39757bbe66d --- /dev/null +++ b/buildlib/pr/ucxx_build.yml @@ -0,0 +1,277 @@ +parameters: + dependsOn: [Static_check] + conda_container: ucxx_rapidsai_ci_conda + conda_cpp_slices: [] + conda_python_slices: [] + wheel_libucxx_slices: [] + wheel_ucxx_slices: [] + wheel_tests_ucxx_slices: [] + docs_slices: [] + devcontainer_slices: [] + +stages: + - stage: UCXX_build + dependsOn: ${{ parameters.dependsOn }} + variables: + UCX_DIR: $(Agent.BuildDirectory)/ucx + UCXX_DIR: $(Agent.BuildDirectory)/ucxx + RAPIDS_BLD_OUTPUT_DIR: $(Build.ArtifactStagingDirectory) + jobs: + - ${{ each slice in parameters.conda_cpp_slices }}: + - job: ucxx_conda_cpp_build_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX conda-cpp-build (${{ slice.name }})' + container: ${{ parameters.conda_container }} + timeoutInMinutes: 60 + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh conda_cpp + displayName: Build UCXX conda C++ package + - task: PublishBuildArtifacts@1 + displayName: Publish conda-cpp artifact + inputs: + 
pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) + artifactName: ucxx-conda-cpp-${{ slice.name }} + + - ${{ each slice in parameters.conda_python_slices }}: + - job: ucxx_conda_python_build_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX conda-python-build (${{ slice.name }})' + container: ${{ parameters.conda_container }} + timeoutInMinutes: 60 + dependsOn: ucxx_conda_cpp_build_${{ slice.name }} + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - task: DownloadBuildArtifacts@1 + displayName: Fetch conda-cpp artifact + inputs: + buildType: current + artifactName: ucxx-conda-cpp-${{ slice.name }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl + - bash: | + rm -rf "$(RAPIDS_BLD_OUTPUT_DIR)" + mv "$(System.DefaultWorkingDirectory)/_dl/ucxx-conda-cpp-${{ slice.name }}" "$(RAPIDS_BLD_OUTPUT_DIR)" + displayName: Stage conda-cpp artifact + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh conda_python + displayName: Build UCXX conda Python package + - task: PublishBuildArtifacts@1 + displayName: Publish conda-python artifact + inputs: + pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) + artifactName: ucxx-conda-python-${{ slice.name }} + + - ${{ each slice in parameters.wheel_libucxx_slices }}: + - job: ucxx_wheel_libucxx_build_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX wheel-build-libucxx (${{ slice.name }})' + container: ${{ slice.container }} + timeoutInMinutes: 60 + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + 
retryCountOnTaskFailure: 5 + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh wheel_libucxx + displayName: Build libucxx wheel + - task: PublishBuildArtifacts@1 + displayName: Publish libucxx wheel artifact + inputs: + pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) + artifactName: ucxx-wheel-libucxx-${{ slice.name }} + + - ${{ each slice in parameters.wheel_ucxx_slices }}: + - job: ucxx_wheel_ucxx_build_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX wheel-build-ucxx (${{ slice.name }})' + container: ${{ slice.container }} + timeoutInMinutes: 60 + dependsOn: ucxx_wheel_libucxx_build_${{ slice.libucxx_slice }} + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + WHEEL_INPUT_DIR: $(System.DefaultWorkingDirectory)/wheel-libucxx + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - task: DownloadBuildArtifacts@1 + displayName: Fetch libucxx wheel artifact + inputs: + buildType: current + artifactName: ucxx-wheel-libucxx-${{ slice.libucxx_slice }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl + - bash: | + rm -rf "$(WHEEL_INPUT_DIR)" + mv "$(System.DefaultWorkingDirectory)/_dl/ucxx-wheel-libucxx-${{ slice.libucxx_slice }}" "$(WHEEL_INPUT_DIR)" + displayName: Stage libucxx wheel artifact + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh wheel_ucxx + displayName: Build ucxx wheel + - task: PublishBuildArtifacts@1 + displayName: Publish ucxx wheel artifact + inputs: + pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) + artifactName: ucxx-wheel-ucxx-${{ slice.name }} + + - ${{ each slice in parameters.docs_slices }}: + - job: ucxx_docs_build_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX docs-build (${{ slice.name }})' + container: ${{ parameters.conda_container }} + 
timeoutInMinutes: 60 + dependsOn: + - ucxx_conda_cpp_build_${{ slice.cpp_slice }} + - ucxx_conda_python_build_${{ slice.python_slice }} + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + DOCS_OUT_DIR: $(Build.ArtifactStagingDirectory)/docs + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - task: DownloadBuildArtifacts@1 + displayName: Fetch conda-cpp artifact + inputs: + buildType: current + artifactName: ucxx-conda-cpp-${{ slice.cpp_slice }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl_cpp + - task: DownloadBuildArtifacts@1 + displayName: Fetch conda-python artifact + inputs: + buildType: current + artifactName: ucxx-conda-python-${{ slice.python_slice }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl_py + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh docs + displayName: Build UCXX docs + env: + CPP_CHANNEL_DIR: $(System.DefaultWorkingDirectory)/_dl_cpp/ucxx-conda-cpp-${{ slice.cpp_slice }} + PYTHON_CHANNEL_DIR: $(System.DefaultWorkingDirectory)/_dl_py/ucxx-conda-python-${{ slice.python_slice }} + RAPIDS_DOCS_DIR: $(DOCS_OUT_DIR) + - task: PublishBuildArtifacts@1 + displayName: Publish docs artifact + inputs: + pathToPublish: $(DOCS_OUT_DIR) + artifactName: ucxx-docs-${{ slice.name }} + + - ${{ each slice in parameters.wheel_tests_ucxx_slices }}: + - job: ucxx_wheel_tests_ucxx_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX wheel-tests-ucxx (${{ slice.name }})' + container: ${{ slice.container }} + timeoutInMinutes: 90 + dependsOn: + - ucxx_wheel_libucxx_build_${{ slice.libucxx_slice }} + - ucxx_wheel_ucxx_build_${{ slice.ucxx_slice }} + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + LIBUCXX_WHL_DIR: 
$(Build.ArtifactStagingDirectory)/libucxx_whl + UCXX_WHL_DIR: $(Build.ArtifactStagingDirectory)/ucxx_whl + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - task: DownloadBuildArtifacts@1 + displayName: Fetch libucxx wheel artifact + inputs: + buildType: current + artifactName: ucxx-wheel-libucxx-${{ slice.libucxx_slice }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl_libucxx + - task: DownloadBuildArtifacts@1 + displayName: Fetch ucxx wheel artifact + inputs: + buildType: current + artifactName: ucxx-wheel-ucxx-${{ slice.ucxx_slice }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl_ucxx + - bash: | + rm -rf "$(LIBUCXX_WHL_DIR)" "$(UCXX_WHL_DIR)" + mv "$(System.DefaultWorkingDirectory)/_dl_libucxx/ucxx-wheel-libucxx-${{ slice.libucxx_slice }}" "$(LIBUCXX_WHL_DIR)" + mv "$(System.DefaultWorkingDirectory)/_dl_ucxx/ucxx-wheel-ucxx-${{ slice.ucxx_slice }}" "$(UCXX_WHL_DIR)" + displayName: Stage wheels + - bash: bash $(UCX_DIR)/buildlib/tools/test_ucxx.sh test_wheel_ucxx + displayName: Run UCXX wheel tests + + - ${{ each slice in parameters.devcontainer_slices }}: + - job: ucxx_devcontainer_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX devcontainer (${{ slice.name }})' + container: ${{ parameters.conda_container }} + timeoutInMinutes: 15 + dependsOn: [] + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh devcontainer + displayName: Validate UCXX devcontainer configs diff --git a/buildlib/pr/ucxx_tests.yml b/buildlib/pr/ucxx_tests.yml new file mode 100644 index 00000000000..f5e83e7babf --- /dev/null +++ 
b/buildlib/pr/ucxx_tests.yml @@ -0,0 +1,48 @@ +parameters: + dependsOn: [Static_check] + cpu_container: ucxx_rapidsai_ci_conda + gpu_container: ucxx_rapidsai_ci_conda_gpu + slices: [] + +stages: + - stage: UCXX_tests + dependsOn: ${{ parameters.dependsOn }} + variables: + UCX_DIR: $(Agent.BuildDirectory)/ucx + UCXX_DIR: $(Agent.BuildDirectory)/ucxx + jobs: + - ${{ each slice in parameters.slices }}: + - job: ucxx_tests_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + ${{ if eq(slice.gpu, true) }}: + displayName: 'UCXX GPU tests (${{ slice.name }})' + container: ${{ parameters.gpu_container }} + timeoutInMinutes: 120 + ${{ if eq(slice.gpu, false) }}: + displayName: 'UCXX tests (${{ slice.name }})' + container: ${{ parameters.cpu_container }} + timeoutInMinutes: 90 + variables: + IS_GPU: ${{ slice.gpu }} + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - bash: bash $(UCX_DIR)/buildlib/tools/test_ucxx.sh build + displayName: Build UCXX + - bash: bash $(UCX_DIR)/buildlib/tools/test_ucxx.sh test_cpp + displayName: Run UCXX C++ tests + # GPU slices only: cupy variants need a real device. + - ${{ if eq(slice.gpu, true) }}: + - bash: bash $(UCX_DIR)/buildlib/tools/test_ucxx.sh test_python + displayName: Run UCXX Python tests From 24623e9bd37cfcea35e2917987c3711f4d87abc9 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Wed, 3 Jun 2026 00:00:08 +0300 Subject: [PATCH 4/7] AZP: pin rapidsai/ucxx to immutable tag v0.51.00a Each UCX PR must test a fixed UCXX revision; refs/heads/main drifts, so a green run says nothing durable. Pin to a tag and bump it deliberately as new UCXX releases are validated. 
--- buildlib/azure-pipelines-pr.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildlib/azure-pipelines-pr.yml b/buildlib/azure-pipelines-pr.yml index 44ccf60fa5f..138f6178e71 100644 --- a/buildlib/azure-pipelines-pr.yml +++ b/buildlib/azure-pipelines-pr.yml @@ -37,7 +37,7 @@ resources: type: github name: rapidsai/ucxx endpoint: Mellanox-lab - ref: refs/heads/main + ref: refs/tags/v0.51.00a extends: template: pr/main.yml From c668a972f41c7eb638d419265c0dc466d3360259 Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Mon, 8 Jun 2026 13:24:01 +0300 Subject: [PATCH 5/7] BUILD: bump RAPIDS CI images 26.06 to 26.08 RAPIDS 26.06 shipped; ToT and the base images we wrap moved to 26.08. --- buildlib/dockers/rapidsai-ci-conda.Dockerfile | 2 +- buildlib/dockers/rapidsai-ci-wheel.Dockerfile | 2 +- buildlib/pr/main.yml | 12 ++++++------ 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/buildlib/dockers/rapidsai-ci-conda.Dockerfile b/buildlib/dockers/rapidsai-ci-conda.Dockerfile index 3dc3e032466..fa0921d6264 100644 --- a/buildlib/dockers/rapidsai-ci-conda.Dockerfile +++ b/buildlib/dockers/rapidsai-ci-conda.Dockerfile @@ -1,7 +1,7 @@ # Azure wrapper around rapidsai/ci-conda: chmod /opt/conda so the non-root UID Azure runs # steps as can use conda/python (rapidsai owns it as root); + adds gdb for stack capture. -ARG BASE_IMAGE=rapidsai/ci-conda:26.06-latest +ARG BASE_IMAGE=rapidsai/ci-conda:26.08-latest FROM ${BASE_IMAGE} RUN chmod -R o+rwX /opt/conda \ diff --git a/buildlib/dockers/rapidsai-ci-wheel.Dockerfile b/buildlib/dockers/rapidsai-ci-wheel.Dockerfile index 99dfd2bf90d..8498d22f3b0 100644 --- a/buildlib/dockers/rapidsai-ci-wheel.Dockerfile +++ b/buildlib/dockers/rapidsai-ci-wheel.Dockerfile @@ -2,7 +2,7 @@ # steps as can write there (rapidsai owns it as root); + adds gdb for stack capture. # Default base = cuda13; cuda12 image built by overriding BASE_IMAGE to the cuda12.9.1 tag. 
-ARG BASE_IMAGE=rapidsai/ci-wheel:26.06-cuda13.2.0-rockylinux8-py3.11 +ARG BASE_IMAGE=rapidsai/ci-wheel:26.08-cuda13.2.0-rockylinux8-py3.11 FROM ${BASE_IMAGE} RUN chmod -R o+rwX /pyenv \ diff --git a/buildlib/pr/main.yml b/buildlib/pr/main.yml index f4b667fb0c9..b4f39a3cf20 100644 --- a/buildlib/pr/main.yml +++ b/buildlib/pr/main.yml @@ -263,24 +263,24 @@ resources: options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) - container: ucxx_rapidsai_ci_conda # Thin wrapper of rapidsai/ci-conda; see buildlib/dockers/rapidsai-ci-conda.Dockerfile. - image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.06-azp-1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.08-azp-1 options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) - container: ucxx_rapidsai_ci_conda_gpu # No IB/host-net: with IB, UCX binds rc_mlx5 and the AM/tag tests hang here. - image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.06-azp-1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.08-azp-1 options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host # Wheel images are CUDA-pinned: one base per CUDA version (see rapidsai-ci-wheel.Dockerfile). 
- container: ucxx_rapidsai_ci_wheel_cuda13 - image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.06-cuda13-azp-1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda13-azp-1 options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) - container: ucxx_rapidsai_ci_wheel_cuda13_gpu - image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.06-cuda13-azp-1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda13-azp-1 options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host - container: ucxx_rapidsai_ci_wheel_cuda12 - image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.06-cuda12-azp-1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda12-azp-1 options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) - container: ucxx_rapidsai_ci_wheel_cuda12_gpu - image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.06-cuda12-azp-1 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda12-azp-1 options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host stages: From ac9f7b1a13f795c3823cc0f2c5b2d941482c129f Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Mon, 8 Jun 2026 16:27:30 +0300 Subject: [PATCH 6/7] AZP: target ucxx main commit instead of release tag Pin the rapidsai/ucxx resource to a specific main commit (33deb0b) rather than v0.51.00a. Alpha tags are cut at code-freeze and don't pick up ongoing main work, and an old tag drifts from RAPIDS CI updates (images, ci/ scripts) that must move in tandem. A pinned commit stays immutable/reproducible while letting us do controlled bumps. This commit already includes ucxx #674, so drop the local patch in build_ucxx.sh + test_ucxx.sh. 
--- buildlib/azure-pipelines-pr.yml | 2 +- buildlib/tools/build_ucxx.sh | 7 ------- buildlib/tools/test_ucxx.sh | 4 ---- 3 files changed, 1 insertion(+), 12 deletions(-) diff --git a/buildlib/azure-pipelines-pr.yml b/buildlib/azure-pipelines-pr.yml index 138f6178e71..abdf8e2a27f 100644 --- a/buildlib/azure-pipelines-pr.yml +++ b/buildlib/azure-pipelines-pr.yml @@ -37,7 +37,7 @@ resources: type: github name: rapidsai/ucxx endpoint: Mellanox-lab - ref: refs/tags/v0.51.00a + ref: 33deb0b581b78027730e8ef86ed32efbb22d0dd8 extends: template: pr/main.yml diff --git a/buildlib/tools/build_ucxx.sh b/buildlib/tools/build_ucxx.sh index b4f4ffdf0ed..4c60f85be26 100755 --- a/buildlib/tools/build_ucxx.sh +++ b/buildlib/tools/build_ucxx.sh @@ -79,13 +79,6 @@ if [[ "$phase" == wheel_* ]]; then export CC="$toolset/gcc" CXX="$toolset/g++" fi -# Upstream ucxx header uses usleep() but omits <unistd.h>; undeclared on -# newer gcc. Affects all C++ phases. -if [[ "$phase" != docs ]]; then - hdr=python/ucxx/ucxx/examples/python_future_task.h - grep -q "include <unistd.h>" "$hdr" || sed -i '/^#pragma once/a #include <unistd.h>' "$hdr" -fi - case "$phase" in conda_cpp) bash ci/build_cpp.sh ;; conda_python) bash ci/build_python.sh ;; diff --git a/buildlib/tools/test_ucxx.sh b/buildlib/tools/test_ucxx.sh index cc6d2aaeca2..b398b8beeb3 100755 --- a/buildlib/tools/test_ucxx.sh +++ b/buildlib/tools/test_ucxx.sh @@ -49,10 +49,6 @@ done case "$phase" in build) - # Upstream ucxx examples header uses usleep() but omits <unistd.h>; - # undeclared on newer gcc. Same patch as build_ucxx.sh. - hdr=python/ucxx/ucxx/examples/python_future_task.h - grep -q "include <unistd.h>" "$hdr" || sed -i '/^#pragma once/a #include <unistd.h>' "$hdr" if [ "${IS_GPU,,}" = "true" ]; then # sccache wrapper crashes CMake's compiler probe on the GPU build hosts; no-op it.
cat > "$HOME/.local/bin/rapids-configure-sccache" <<'EOF' From 48d6dd7ecfbf8b66205cb403e95faf8e21f396db Mon Sep 17 00:00:00 2001 From: Alexey Rivkin Date: Wed, 3 Jun 2026 11:08:01 +0300 Subject: [PATCH 7/7] AZP/RELEASE: build + publish UCXX in the UCX release pipeline Extend the release pipeline to build UCXX conda + libucxx/ucxx wheels + docs on a UCX release tag and on a daily cron, reusing the PR pipeline's UCXX scripts and CUDA-pinned images. Upload/publish steps (anaconda.org for conda+wheels, S3 for docs) are wired but gated `condition: false` until the rapids tokens are provisioned in the Azure secret store - each step's TODO names its secret. Builds on the PR-pipeline work: per-slice wheel container (cuda12/cuda13), py3.11 wheels, ucxx pinned to main commit 33deb0b (same pin as the PR pipeline), and the host-driver override that the wheel image needs. PR pipeline unchanged. --- buildlib/azure-pipelines-release.yml | 69 +++++++++++++++ buildlib/pr/ucxx_build.yml | 122 ++++++++++++++++++++++++++- 2 files changed, 190 insertions(+), 1 deletion(-) diff --git a/buildlib/azure-pipelines-release.yml b/buildlib/azure-pipelines-release.yml index 028f3f29d32..b2700488ac5 100644 --- a/buildlib/azure-pipelines-release.yml +++ b/buildlib/azure-pipelines-release.yml @@ -9,10 +9,31 @@ pr: - master - v*.*.x +# UCXX nightly: rebuild same SHA daily and publish alpha-versioned packages +# to rapidsai-nightly channel. Tune cron time to avoid RAPIDS GHA window +# during parallel-validation phase. +schedules: + - cron: '0 7 * * *' + displayName: UCXX nightly + branches: + include: [master] + always: true + variables: DOCKER_OPT_VOLUMES: -v /hpc/local:/hpc/local + DOCKER_OPT_ARGS: --cap-add=SYS_PTRACE + # UCXX publish: initial dry-run channel until cutover. Flip to empty + # (use 'main' default) when atomic cutover from RAPIDS GHA is approved.
+ RAPIDS_CONDA_UPLOAD_LABEL: blossom-test resources: + repositories: + - repository: ucxx + type: github + name: rapidsai/ucxx + endpoint: Mellanox-lab + ref: 33deb0b581b78027730e8ef86ed32efbb22d0dd8 + containers: # x86_64 - container: centos7_cuda11_x86_64 @@ -78,6 +99,18 @@ resources: image: rdmz-harbor.rdmz.labs.mlnx/ucx/aarch64/rocky9-mofed24.10-cuda13:2 options: $(DOCKER_OPT_VOLUMES) + # UCXX release: CPU-only conda + wheel builders (mirrors PR pipeline). + # Wheel images are CUDA-pinned: one base per CUDA version (see rapidsai-ci-wheel.Dockerfile). + - container: ucxx_rapidsai_ci_conda + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.08-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_wheel_cuda12 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda12-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_wheel_cuda13 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda13-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + stages: - stage: Prepare jobs: @@ -150,3 +183,39 @@ stages: container: centos8_cuda11_aarch64 demands: ucx-arm64 target: publish-release + + - template: pr/ucxx_build.yml + parameters: + dependsOn: [Prepare] + # Fires on UCX release tag (CheckRelease.Launch=True) OR nightly cron + # (Build.Reason=Schedule). Upload steps dispatch token/channel/label at + # runtime based on BUILD_REASON. 
+ condition: | + or( + eq(dependencies.Prepare.outputs['CheckRelease.Result.Launch'], 'True'), + eq(variables['Build.Reason'], 'Schedule') + ) + conda_cpp_slices: + - { name: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + conda_python_slices: + - { name: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + # libucxx + ucxx wheels for cuda12 + cuda13, x86_64 + aarch64. 
+ wheel_libucxx_slices: + - { name: x86_64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + wheel_ucxx_slices: + - { name: x86_64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, libucxx_slice: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, libucxx_slice: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, libucxx_slice: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, libucxx_slice: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + docs_slices: + - { name: x86_64_cuda13_py311, cpp_slice: x86_64_cuda13_py311, python_slice: x86_64_cuda13_py311, + demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } diff --git a/buildlib/pr/ucxx_build.yml b/buildlib/pr/ucxx_build.yml index 39757bbe66d..54197521eb7 100644 --- a/buildlib/pr/ucxx_build.yml +++ b/buildlib/pr/ucxx_build.yml @@ -1,5 +1,6 @@ parameters: dependsOn: [Static_check] + condition: succeeded() conda_container: ucxx_rapidsai_ci_conda conda_cpp_slices: [] conda_python_slices: [] @@ -11,7 +12,9 @@ parameters: 
stages: - stage: UCXX_build + displayName: 'UCXX build + publish' dependsOn: ${{ parameters.dependsOn }} + condition: ${{ parameters.condition }} variables: UCX_DIR: $(Agent.BuildDirectory)/ucx UCXX_DIR: $(Agent.BuildDirectory)/ucxx @@ -45,6 +48,31 @@ stages: inputs: pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) artifactName: ucxx-conda-cpp-${{ slice.name }} + # TODO cutover: provision CONDA_RAPIDSAI_TOKEN (release tag) + # and CONDA_RAPIDSAI_NIGHTLY_TOKEN (cron/ResourceTrigger) in Azure + # secret store, then delete `condition: false`. + - bash: | + set -eEo pipefail + case "$BUILD_REASON" in + IndividualCI) tok="$RELEASE_TOK" ;; + Schedule|ResourceTrigger) tok="$NIGHTLY_TOK" ;; + *) echo "unexpected Build.Reason=$BUILD_REASON" >&2; exit 1 ;; + esac + shopt -s nullglob + pkgs=("$(RAPIDS_BLD_OUTPUT_DIR)"/*/*.conda "$(RAPIDS_BLD_OUTPUT_DIR)"/*/*.tar.bz2) + if [ ${#pkgs[@]} -eq 0 ]; then + echo "ERROR: no conda packages found under $(RAPIDS_BLD_OUTPUT_DIR)" >&2 + exit 1 + fi + anaconda -t "$tok" upload \ + --label "${RAPIDS_CONDA_UPLOAD_LABEL:-main}" \ + --skip-existing --no-progress \ + "${pkgs[@]}" + displayName: Upload conda-cpp to anaconda.org + condition: false + env: + RELEASE_TOK: $(CONDA_RAPIDSAI_TOKEN) + NIGHTLY_TOK: $(CONDA_RAPIDSAI_NIGHTLY_TOKEN) - ${{ each slice in parameters.conda_python_slices }}: - job: ucxx_conda_python_build_${{ slice.name }} @@ -85,6 +113,31 @@ stages: inputs: pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) artifactName: ucxx-conda-python-${{ slice.name }} + # TODO cutover: provision CONDA_RAPIDSAI_TOKEN (release tag) + # and CONDA_RAPIDSAI_NIGHTLY_TOKEN (cron/ResourceTrigger) in Azure + # secret store, then delete `condition: false`. 
+ - bash: | + set -eEo pipefail + case "$BUILD_REASON" in + IndividualCI) tok="$RELEASE_TOK" ;; + Schedule|ResourceTrigger) tok="$NIGHTLY_TOK" ;; + *) echo "unexpected Build.Reason=$BUILD_REASON" >&2; exit 1 ;; + esac + shopt -s nullglob + pkgs=("$(RAPIDS_BLD_OUTPUT_DIR)"/*/*.conda "$(RAPIDS_BLD_OUTPUT_DIR)"/*/*.tar.bz2) + if [ ${#pkgs[@]} -eq 0 ]; then + echo "ERROR: no conda packages found under $(RAPIDS_BLD_OUTPUT_DIR)" >&2 + exit 1 + fi + anaconda -t "$tok" upload \ + --label "${RAPIDS_CONDA_UPLOAD_LABEL:-main}" \ + --skip-existing --no-progress \ + "${pkgs[@]}" + displayName: Upload conda-python to anaconda.org + condition: false + env: + RELEASE_TOK: $(CONDA_RAPIDSAI_TOKEN) + NIGHTLY_TOK: $(CONDA_RAPIDSAI_NIGHTLY_TOKEN) - ${{ each slice in parameters.wheel_libucxx_slices }}: - job: ucxx_wheel_libucxx_build_${{ slice.name }} @@ -114,6 +167,32 @@ stages: inputs: pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) artifactName: ucxx-wheel-libucxx-${{ slice.name }} + # TODO cutover: provision CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN + # in Azure secret store, then delete `condition: false`. (Upstream + # uses the same wheels token for both release and nightly uploads.) + # ci-wheel image is pyenv-based: anaconda-client not preinstalled. 
+ - bash: | + set -eEo pipefail + case "$BUILD_REASON" in + IndividualCI|Schedule|ResourceTrigger) : ;; + *) echo "unexpected Build.Reason=$BUILD_REASON" >&2; exit 1 ;; + esac + pip install --user --quiet anaconda-client + export PATH="$HOME/.local/bin:$PATH" + shopt -s nullglob + whls=("$(RAPIDS_BLD_OUTPUT_DIR)"/*.whl) + if [ ${#whls[@]} -eq 0 ]; then + echo "ERROR: no wheels found under $(RAPIDS_BLD_OUTPUT_DIR)" >&2 + exit 1 + fi + anaconda -t "$WHEEL_TOKEN" upload \ + --label "${RAPIDS_CONDA_UPLOAD_LABEL:-main}" \ + --skip-existing --no-progress \ + "${whls[@]}" + displayName: Upload libucxx wheel to anaconda.org + condition: false + env: + WHEEL_TOKEN: $(CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN) - ${{ each slice in parameters.wheel_ucxx_slices }}: - job: ucxx_wheel_ucxx_build_${{ slice.name }} @@ -129,7 +208,7 @@ stages: variables: RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} - WHEEL_INPUT_DIR: $(System.DefaultWorkingDirectory)/wheel-libucxx + WHEEL_INPUT_DIR: $(Build.ArtifactStagingDirectory)/wheel-libucxx steps: - checkout: self path: ucx @@ -155,6 +234,32 @@ stages: inputs: pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) artifactName: ucxx-wheel-ucxx-${{ slice.name }} + # TODO cutover: provision CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN + # in Azure secret store, then delete `condition: false`. (Upstream + # uses the same wheels token for both release and nightly uploads.) + # ci-wheel image is pyenv-based: anaconda-client not preinstalled. 
+ - bash: | + set -eEo pipefail + case "$BUILD_REASON" in + IndividualCI|Schedule|ResourceTrigger) : ;; + *) echo "unexpected Build.Reason=$BUILD_REASON" >&2; exit 1 ;; + esac + pip install --user --quiet anaconda-client + export PATH="$HOME/.local/bin:$PATH" + shopt -s nullglob + whls=("$(RAPIDS_BLD_OUTPUT_DIR)"/*.whl) + if [ ${#whls[@]} -eq 0 ]; then + echo "ERROR: no wheels found under $(RAPIDS_BLD_OUTPUT_DIR)" >&2 + exit 1 + fi + anaconda -t "$WHEEL_TOKEN" upload \ + --label "${RAPIDS_CONDA_UPLOAD_LABEL:-main}" \ + --skip-existing --no-progress \ + "${whls[@]}" + displayName: Upload ucxx wheel to anaconda.org + condition: false + env: + WHEEL_TOKEN: $(CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN) - ${{ each slice in parameters.docs_slices }}: - job: ucxx_docs_build_${{ slice.name }} @@ -204,6 +309,21 @@ stages: inputs: pathToPublish: $(DOCS_OUT_DIR) artifactName: ucxx-docs-${{ slice.name }} + # TODO cutover: provision RAPIDS_AWS_KEY/RAPIDS_AWS_SECRET in Azure + # secret store, then delete `condition: false`. + - bash: | + set -eEo pipefail + ver=$(head -1 "$(UCXX_DIR)/VERSION" | sed -E 's/^([0-9]+)\.([0-9]+).*/\1.\2/') + if [ -z "$ver" ]; then + echo "ERROR: could not parse MAJOR.MINOR from $(UCXX_DIR)/VERSION" >&2 + exit 1 + fi + aws s3 sync "$(DOCS_OUT_DIR)/" "s3://rapidsai-docs/ucxx/${ver}/" + displayName: Upload docs to S3 + condition: false + env: + AWS_ACCESS_KEY_ID: $(RAPIDS_AWS_KEY) + AWS_SECRET_ACCESS_KEY: $(RAPIDS_AWS_SECRET) - ${{ each slice in parameters.wheel_tests_ucxx_slices }}: - job: ucxx_wheel_tests_ucxx_${{ slice.name }}