diff --git a/buildlib/azure-pipelines-pr.yml b/buildlib/azure-pipelines-pr.yml index d446fe72ea7..abdf8e2a27f 100644 --- a/buildlib/azure-pipelines-pr.yml +++ b/buildlib/azure-pipelines-pr.yml @@ -31,5 +31,13 @@ pr: - buildlib/tools/perf_results.py - buildlib/tools/perf-common.yml +resources: + repositories: + - repository: ucxx + type: github + name: rapidsai/ucxx + endpoint: Mellanox-lab + ref: 33deb0b581b78027730e8ef86ed32efbb22d0dd8 + extends: template: pr/main.yml diff --git a/buildlib/azure-pipelines-release.yml b/buildlib/azure-pipelines-release.yml index 028f3f29d32..b2700488ac5 100644 --- a/buildlib/azure-pipelines-release.yml +++ b/buildlib/azure-pipelines-release.yml @@ -9,10 +9,31 @@ pr: - master - v*.*.x +# UCXX nightly: rebuild same SHA daily and publish alpha-versioned packages +# to rapidsai-nightly channel. Tune cron time to avoid RAPIDS GHA window +# during parallel-validation phase. +schedules: + - cron: '0 7 * * *' + displayName: UCXX nightly + branches: + include: [master] + always: true + variables: DOCKER_OPT_VOLUMES: -v /hpc/local:/hpc/local + DOCKER_OPT_ARGS: --cap-add=SYS_PTRACE + # UCXX publish: initial dry-run channel until cutover. Flip to empty + # (use 'main' default) when atomic cutover from RAPIDS GHA is approved. + RAPIDS_CONDA_UPLOAD_LABEL: blossom-test resources: + repositories: + - repository: ucxx + type: github + name: rapidsai/ucxx + endpoint: Mellanox-lab + ref: 33deb0b581b78027730e8ef86ed32efbb22d0dd8 + containers: # x86_64 - container: centos7_cuda11_x86_64 @@ -78,6 +99,18 @@ resources: image: rdmz-harbor.rdmz.labs.mlnx/ucx/aarch64/rocky9-mofed24.10-cuda13:2 options: $(DOCKER_OPT_VOLUMES) + # UCXX release: CPU-only conda + wheel builders (mirrors PR pipeline). + # Wheel images are CUDA-pinned: one base per CUDA version (see rapidsai-ci-wheel.Dockerfile). + - container: ucxx_rapidsai_ci_conda + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.08-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_wheel_cuda12 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda12-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_wheel_cuda13 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda13-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + stages: - stage: Prepare jobs: @@ -150,3 +183,39 @@ stages: container: centos8_cuda11_aarch64 demands: ucx-arm64 target: publish-release + + - template: pr/ucxx_build.yml + parameters: + dependsOn: [Prepare] + # Fires on UCX release tag (CheckRelease.Launch=True) OR nightly cron + # (Build.Reason=Schedule). Upload steps dispatch token/channel/label at + # runtime based on BUILD_REASON. + condition: | + or( + eq(dependencies.Prepare.outputs['CheckRelease.Result.Launch'], 'True'), + eq(variables['Build.Reason'], 'Schedule') + ) + conda_cpp_slices: + - { name: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + conda_python_slices: + - { name: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + # libucxx + ucxx wheels for cuda12 + cuda13, x86_64 + aarch64. + wheel_libucxx_slices: + - { name: x86_64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + wheel_ucxx_slices: + - { name: x86_64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, libucxx_slice: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, libucxx_slice: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, libucxx_slice: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, libucxx_slice: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + docs_slices: + - { name: x86_64_cuda13_py311, cpp_slice: x86_64_cuda13_py311, python_slice: x86_64_cuda13_py311, + demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } diff --git a/buildlib/dockers/rapidsai-ci-conda.Dockerfile b/buildlib/dockers/rapidsai-ci-conda.Dockerfile new file mode 100644 index 00000000000..fa0921d6264 --- /dev/null +++ b/buildlib/dockers/rapidsai-ci-conda.Dockerfile @@ -0,0 +1,10 @@ +# Azure wrapper around rapidsai/ci-conda: chmod /opt/conda so the non-root UID Azure runs +# steps as can use conda/python (rapidsai owns it as root); + adds gdb for stack capture. + +ARG BASE_IMAGE=rapidsai/ci-conda:26.08-latest +FROM ${BASE_IMAGE} + +RUN chmod -R o+rwX /opt/conda \ + && apt-get update \ + && apt-get install -y --no-install-recommends gdb \ + && rm -rf /var/lib/apt/lists/* diff --git a/buildlib/dockers/rapidsai-ci-wheel.Dockerfile b/buildlib/dockers/rapidsai-ci-wheel.Dockerfile new file mode 100644 index 00000000000..8498d22f3b0 --- /dev/null +++ b/buildlib/dockers/rapidsai-ci-wheel.Dockerfile @@ -0,0 +1,10 @@ +# Azure wrapper around rapidsai/ci-wheel: chmod /pyenv so the non-root UID Azure runs +# steps as can write there (rapidsai owns it as root); + adds gdb for stack capture. +# Default base = cuda13; cuda12 image built by overriding BASE_IMAGE to the cuda12.9.1 tag. + +ARG BASE_IMAGE=rapidsai/ci-wheel:26.08-cuda13.2.0-rockylinux8-py3.11 +FROM ${BASE_IMAGE} + +RUN chmod -R o+rwX /pyenv \ + && dnf install -y gdb \ + && dnf clean all diff --git a/buildlib/pr/main.yml b/buildlib/pr/main.yml index 0e087877a5b..b4f39a3cf20 100644 --- a/buildlib/pr/main.yml +++ b/buildlib/pr/main.yml @@ -261,6 +261,27 @@ resources: - container: centos10stream image: rdmz-harbor.rdmz.labs.mlnx/hpcx/x86_64/centos10stream/builder:inbox options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_conda + # Thin wrapper of rapidsai/ci-conda; see buildlib/dockers/rapidsai-ci-conda.Dockerfile. + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.08-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_conda_gpu + # No IB/host-net: with IB, UCX binds rc_mlx5 and the AM/tag tests hang here. + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-conda:26.08-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host + # Wheel images are CUDA-pinned: one base per CUDA version (see rapidsai-ci-wheel.Dockerfile). + - container: ucxx_rapidsai_ci_wheel_cuda13 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda13-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_wheel_cuda13_gpu + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda13-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host + - container: ucxx_rapidsai_ci_wheel_cuda12 + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda12-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) + - container: ucxx_rapidsai_ci_wheel_cuda12_gpu + image: rdmz-harbor.rdmz.labs.mlnx/ucx/rapidsai-ci-wheel:26.08-cuda12-azp-1 + options: $(DOCKER_OPT_ARGS) $(DOCKER_OPT_VOLUMES) --gpus all --ipc=host stages: - stage: Codestyle @@ -349,6 +370,52 @@ stages: demands: ucx_docker -equals yes container: coverity_rh7 + - template: ucxx_build.yml + parameters: + dependsOn: [Static_check] + # Wheel GPU tests run on x86 only (no arm64 GPU runner); cuda12 + cuda13. + wheel_tests_ucxx_slices: + - { name: x86_64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12_gpu, libucxx_slice: x86_64_cuda12_py311, ucxx_slice: x86_64_cuda12_py311, + demands: ucx_gpu, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13_gpu, libucxx_slice: x86_64_cuda13_py311, ucxx_slice: x86_64_cuda13_py311, + demands: ucx_gpu, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + conda_cpp_slices: + - { name: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + conda_python_slices: + - { name: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + # libucxx + ucxx wheels for cuda12 + cuda13, x86_64 + aarch64. + wheel_libucxx_slices: + - { name: x86_64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + wheel_ucxx_slices: + - { name: x86_64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, libucxx_slice: x86_64_cuda12_py311, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, libucxx_slice: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, container: ucxx_rapidsai_ci_wheel_cuda12, libucxx_slice: aarch64_cuda12_py311, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, container: ucxx_rapidsai_ci_wheel_cuda13, libucxx_slice: aarch64_cuda13_py311, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + docs_slices: + - { name: x86_64_cuda13_py311, cpp_slice: x86_64_cuda13_py311, python_slice: x86_64_cuda13_py311, + demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + devcontainer_slices: + - { name: x86_64_cuda13_py311, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + + - template: ucxx_tests.yml + parameters: + dependsOn: [Static_check] + slices: + - { name: x86_64_cuda13_py313, gpu: true, demands: ucx_gpu, rapids_cuda_version: '13.2.0', rapids_py_version: '3.13' } + - { name: x86_64_cuda12_py311, gpu: false, demands: ucx_docker, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: x86_64_cuda13_py311, gpu: false, demands: ucx_docker, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - { name: aarch64_cuda12_py311, gpu: false, demands: ucx_arm64, rapids_cuda_version: '12.9.1', rapids_py_version: '3.11' } + - { name: aarch64_cuda13_py311, gpu: false, demands: ucx_arm64, rapids_cuda_version: '13.2.0', rapids_py_version: '3.11' } + - stage: Tests dependsOn: [Basic_compile] jobs: diff --git a/buildlib/pr/ucxx_build.yml b/buildlib/pr/ucxx_build.yml new file mode 100644 index 00000000000..54197521eb7 --- /dev/null +++ b/buildlib/pr/ucxx_build.yml @@ -0,0 +1,397 @@ +parameters: + dependsOn: [Static_check] + condition: succeeded() + conda_container: ucxx_rapidsai_ci_conda + conda_cpp_slices: [] + conda_python_slices: [] + wheel_libucxx_slices: [] + wheel_ucxx_slices: [] + wheel_tests_ucxx_slices: [] + docs_slices: [] + devcontainer_slices: [] + +stages: + - stage: UCXX_build + displayName: 'UCXX build + publish' + dependsOn: ${{ parameters.dependsOn }} + condition: ${{ parameters.condition }} + variables: + UCX_DIR: $(Agent.BuildDirectory)/ucx + UCXX_DIR: $(Agent.BuildDirectory)/ucxx + RAPIDS_BLD_OUTPUT_DIR: $(Build.ArtifactStagingDirectory) + jobs: + - ${{ each slice in parameters.conda_cpp_slices }}: + - job: ucxx_conda_cpp_build_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX conda-cpp-build (${{ slice.name }})' + container: ${{ parameters.conda_container }} + timeoutInMinutes: 60 + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh conda_cpp + displayName: Build UCXX conda C++ package + - task: PublishBuildArtifacts@1 + displayName: Publish conda-cpp artifact + inputs: + pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) + artifactName: ucxx-conda-cpp-${{ slice.name }} + # TODO cutover: provision CONDA_RAPIDSAI_TOKEN (release tag) + # and CONDA_RAPIDSAI_NIGHTLY_TOKEN (cron/ResourceTrigger) in Azure + # secret store, then delete `condition: false`. + - bash: | + set -eEo pipefail + case "$BUILD_REASON" in + IndividualCI) tok="$RELEASE_TOK" ;; + Schedule|ResourceTrigger) tok="$NIGHTLY_TOK" ;; + *) echo "unexpected Build.Reason=$BUILD_REASON" >&2; exit 1 ;; + esac + shopt -s nullglob + pkgs=("$(RAPIDS_BLD_OUTPUT_DIR)"/*/*.conda "$(RAPIDS_BLD_OUTPUT_DIR)"/*/*.tar.bz2) + if [ ${#pkgs[@]} -eq 0 ]; then + echo "ERROR: no conda packages found under $(RAPIDS_BLD_OUTPUT_DIR)" >&2 + exit 1 + fi + anaconda -t "$tok" upload \ + --label "${RAPIDS_CONDA_UPLOAD_LABEL:-main}" \ + --skip-existing --no-progress \ + "${pkgs[@]}" + displayName: Upload conda-cpp to anaconda.org + condition: false + env: + RELEASE_TOK: $(CONDA_RAPIDSAI_TOKEN) + NIGHTLY_TOK: $(CONDA_RAPIDSAI_NIGHTLY_TOKEN) + + - ${{ each slice in parameters.conda_python_slices }}: + - job: ucxx_conda_python_build_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX conda-python-build (${{ slice.name }})' + container: ${{ parameters.conda_container }} + timeoutInMinutes: 60 + dependsOn: ucxx_conda_cpp_build_${{ slice.name }} + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - task: DownloadBuildArtifacts@1 + displayName: Fetch conda-cpp artifact + inputs: + buildType: current + artifactName: ucxx-conda-cpp-${{ slice.name }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl + - bash: | + rm -rf "$(RAPIDS_BLD_OUTPUT_DIR)" + mv "$(System.DefaultWorkingDirectory)/_dl/ucxx-conda-cpp-${{ slice.name }}" "$(RAPIDS_BLD_OUTPUT_DIR)" + displayName: Stage conda-cpp artifact + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh conda_python + displayName: Build UCXX conda Python package + - task: PublishBuildArtifacts@1 + displayName: Publish conda-python artifact + inputs: + pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) + artifactName: ucxx-conda-python-${{ slice.name }} + # TODO cutover: provision CONDA_RAPIDSAI_TOKEN (release tag) + # and CONDA_RAPIDSAI_NIGHTLY_TOKEN (cron/ResourceTrigger) in Azure + # secret store, then delete `condition: false`. + - bash: | + set -eEo pipefail + case "$BUILD_REASON" in + IndividualCI) tok="$RELEASE_TOK" ;; + Schedule|ResourceTrigger) tok="$NIGHTLY_TOK" ;; + *) echo "unexpected Build.Reason=$BUILD_REASON" >&2; exit 1 ;; + esac + shopt -s nullglob + pkgs=("$(RAPIDS_BLD_OUTPUT_DIR)"/*/*.conda "$(RAPIDS_BLD_OUTPUT_DIR)"/*/*.tar.bz2) + if [ ${#pkgs[@]} -eq 0 ]; then + echo "ERROR: no conda packages found under $(RAPIDS_BLD_OUTPUT_DIR)" >&2 + exit 1 + fi + anaconda -t "$tok" upload \ + --label "${RAPIDS_CONDA_UPLOAD_LABEL:-main}" \ + --skip-existing --no-progress \ + "${pkgs[@]}" + displayName: Upload conda-python to anaconda.org + condition: false + env: + RELEASE_TOK: $(CONDA_RAPIDSAI_TOKEN) + NIGHTLY_TOK: $(CONDA_RAPIDSAI_NIGHTLY_TOKEN) + + - ${{ each slice in parameters.wheel_libucxx_slices }}: + - job: ucxx_wheel_libucxx_build_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX wheel-build-libucxx (${{ slice.name }})' + container: ${{ slice.container }} + timeoutInMinutes: 60 + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh wheel_libucxx + displayName: Build libucxx wheel + - task: PublishBuildArtifacts@1 + displayName: Publish libucxx wheel artifact + inputs: + pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) + artifactName: ucxx-wheel-libucxx-${{ slice.name }} + # TODO cutover: provision CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN + # in Azure secret store, then delete `condition: false`. (Upstream + # uses the same wheels token for both release and nightly uploads.) + # ci-wheel image is pyenv-based: anaconda-client not preinstalled. + - bash: | + set -eEo pipefail + case "$BUILD_REASON" in + IndividualCI|Schedule|ResourceTrigger) : ;; + *) echo "unexpected Build.Reason=$BUILD_REASON" >&2; exit 1 ;; + esac + pip install --user --quiet anaconda-client + export PATH="$HOME/.local/bin:$PATH" + shopt -s nullglob + whls=("$(RAPIDS_BLD_OUTPUT_DIR)"/*.whl) + if [ ${#whls[@]} -eq 0 ]; then + echo "ERROR: no wheels found under $(RAPIDS_BLD_OUTPUT_DIR)" >&2 + exit 1 + fi + anaconda -t "$WHEEL_TOKEN" upload \ + --label "${RAPIDS_CONDA_UPLOAD_LABEL:-main}" \ + --skip-existing --no-progress \ + "${whls[@]}" + displayName: Upload libucxx wheel to anaconda.org + condition: false + env: + WHEEL_TOKEN: $(CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN) + + - ${{ each slice in parameters.wheel_ucxx_slices }}: + - job: ucxx_wheel_ucxx_build_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX wheel-build-ucxx (${{ slice.name }})' + container: ${{ slice.container }} + timeoutInMinutes: 60 + dependsOn: ucxx_wheel_libucxx_build_${{ slice.libucxx_slice }} + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + WHEEL_INPUT_DIR: $(Build.ArtifactStagingDirectory)/wheel-libucxx + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - task: DownloadBuildArtifacts@1 + displayName: Fetch libucxx wheel artifact + inputs: + buildType: current + artifactName: ucxx-wheel-libucxx-${{ slice.libucxx_slice }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl + - bash: | + rm -rf "$(WHEEL_INPUT_DIR)" + mv "$(System.DefaultWorkingDirectory)/_dl/ucxx-wheel-libucxx-${{ slice.libucxx_slice }}" "$(WHEEL_INPUT_DIR)" + displayName: Stage libucxx wheel artifact + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh wheel_ucxx + displayName: Build ucxx wheel + - task: PublishBuildArtifacts@1 + displayName: Publish ucxx wheel artifact + inputs: + pathToPublish: $(RAPIDS_BLD_OUTPUT_DIR) + artifactName: ucxx-wheel-ucxx-${{ slice.name }} + # TODO cutover: provision CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN + # in Azure secret store, then delete `condition: false`. (Upstream + # uses the same wheels token for both release and nightly uploads.) + # ci-wheel image is pyenv-based: anaconda-client not preinstalled. + - bash: | + set -eEo pipefail + case "$BUILD_REASON" in + IndividualCI|Schedule|ResourceTrigger) : ;; + *) echo "unexpected Build.Reason=$BUILD_REASON" >&2; exit 1 ;; + esac + pip install --user --quiet anaconda-client + export PATH="$HOME/.local/bin:$PATH" + shopt -s nullglob + whls=("$(RAPIDS_BLD_OUTPUT_DIR)"/*.whl) + if [ ${#whls[@]} -eq 0 ]; then + echo "ERROR: no wheels found under $(RAPIDS_BLD_OUTPUT_DIR)" >&2 + exit 1 + fi + anaconda -t "$WHEEL_TOKEN" upload \ + --label "${RAPIDS_CONDA_UPLOAD_LABEL:-main}" \ + --skip-existing --no-progress \ + "${whls[@]}" + displayName: Upload ucxx wheel to anaconda.org + condition: false + env: + WHEEL_TOKEN: $(CONDA_RAPIDSAI_WHEELS_NIGHTLY_TOKEN) + + - ${{ each slice in parameters.docs_slices }}: + - job: ucxx_docs_build_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX docs-build (${{ slice.name }})' + container: ${{ parameters.conda_container }} + timeoutInMinutes: 60 + dependsOn: + - ucxx_conda_cpp_build_${{ slice.cpp_slice }} + - ucxx_conda_python_build_${{ slice.python_slice }} + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + DOCS_OUT_DIR: $(Build.ArtifactStagingDirectory)/docs + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - task: DownloadBuildArtifacts@1 + displayName: Fetch conda-cpp artifact + inputs: + buildType: current + artifactName: ucxx-conda-cpp-${{ slice.cpp_slice }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl_cpp + - task: DownloadBuildArtifacts@1 + displayName: Fetch conda-python artifact + inputs: + buildType: current + artifactName: ucxx-conda-python-${{ slice.python_slice }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl_py + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh docs + displayName: Build UCXX docs + env: + CPP_CHANNEL_DIR: $(System.DefaultWorkingDirectory)/_dl_cpp/ucxx-conda-cpp-${{ slice.cpp_slice }} + PYTHON_CHANNEL_DIR: $(System.DefaultWorkingDirectory)/_dl_py/ucxx-conda-python-${{ slice.python_slice }} + RAPIDS_DOCS_DIR: $(DOCS_OUT_DIR) + - task: PublishBuildArtifacts@1 + displayName: Publish docs artifact + inputs: + pathToPublish: $(DOCS_OUT_DIR) + artifactName: ucxx-docs-${{ slice.name }} + # TODO cutover: provision RAPIDS_AWS_KEY/RAPIDS_AWS_SECRET in Azure + # secret store, then delete `condition: false`. + - bash: | + set -eEo pipefail + ver=$(head -1 "$(UCXX_DIR)/VERSION" | sed -E 's/^([0-9]+)\.([0-9]+).*/\1.\2/') + if [ -z "$ver" ]; then + echo "ERROR: could not parse MAJOR.MINOR from $(UCXX_DIR)/VERSION" >&2 + exit 1 + fi + aws s3 sync "$(DOCS_OUT_DIR)/" "s3://rapidsai-docs/ucxx/${ver}/" + displayName: Upload docs to S3 + condition: false + env: + AWS_ACCESS_KEY_ID: $(RAPIDS_AWS_KEY) + AWS_SECRET_ACCESS_KEY: $(RAPIDS_AWS_SECRET) + + - ${{ each slice in parameters.wheel_tests_ucxx_slices }}: + - job: ucxx_wheel_tests_ucxx_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX wheel-tests-ucxx (${{ slice.name }})' + container: ${{ slice.container }} + timeoutInMinutes: 90 + dependsOn: + - ucxx_wheel_libucxx_build_${{ slice.libucxx_slice }} + - ucxx_wheel_ucxx_build_${{ slice.ucxx_slice }} + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + LIBUCXX_WHL_DIR: $(Build.ArtifactStagingDirectory)/libucxx_whl + UCXX_WHL_DIR: $(Build.ArtifactStagingDirectory)/ucxx_whl + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - task: DownloadBuildArtifacts@1 + displayName: Fetch libucxx wheel artifact + inputs: + buildType: current + artifactName: ucxx-wheel-libucxx-${{ slice.libucxx_slice }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl_libucxx + - task: DownloadBuildArtifacts@1 + displayName: Fetch ucxx wheel artifact + inputs: + buildType: current + artifactName: ucxx-wheel-ucxx-${{ slice.ucxx_slice }} + downloadPath: $(System.DefaultWorkingDirectory)/_dl_ucxx + - bash: | + rm -rf "$(LIBUCXX_WHL_DIR)" "$(UCXX_WHL_DIR)" + mv "$(System.DefaultWorkingDirectory)/_dl_libucxx/ucxx-wheel-libucxx-${{ slice.libucxx_slice }}" "$(LIBUCXX_WHL_DIR)" + mv "$(System.DefaultWorkingDirectory)/_dl_ucxx/ucxx-wheel-ucxx-${{ slice.ucxx_slice }}" "$(UCXX_WHL_DIR)" + displayName: Stage wheels + - bash: bash $(UCX_DIR)/buildlib/tools/test_ucxx.sh test_wheel_ucxx + displayName: Run UCXX wheel tests + + - ${{ each slice in parameters.devcontainer_slices }}: + - job: ucxx_devcontainer_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + displayName: 'UCXX devcontainer (${{ slice.name }})' + container: ${{ parameters.conda_container }} + timeoutInMinutes: 15 + dependsOn: [] + variables: + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - bash: bash $(UCX_DIR)/buildlib/tools/build_ucxx.sh devcontainer + displayName: Validate UCXX devcontainer configs diff --git a/buildlib/pr/ucxx_tests.yml b/buildlib/pr/ucxx_tests.yml new file mode 100644 index 00000000000..f5e83e7babf --- /dev/null +++ b/buildlib/pr/ucxx_tests.yml @@ -0,0 +1,48 @@ +parameters: + dependsOn: [Static_check] + cpu_container: ucxx_rapidsai_ci_conda + gpu_container: ucxx_rapidsai_ci_conda_gpu + slices: [] + +stages: + - stage: UCXX_tests + dependsOn: ${{ parameters.dependsOn }} + variables: + UCX_DIR: $(Agent.BuildDirectory)/ucx + UCXX_DIR: $(Agent.BuildDirectory)/ucxx + jobs: + - ${{ each slice in parameters.slices }}: + - job: ucxx_tests_${{ slice.name }} + workspace: + clean: all + pool: + name: MLNX + demands: ${{ slice.demands }} + ${{ if eq(slice.gpu, true) }}: + displayName: 'UCXX GPU tests (${{ slice.name }})' + container: ${{ parameters.gpu_container }} + timeoutInMinutes: 120 + ${{ if eq(slice.gpu, false) }}: + displayName: 'UCXX tests (${{ slice.name }})' + container: ${{ parameters.cpu_container }} + timeoutInMinutes: 90 + variables: + IS_GPU: ${{ slice.gpu }} + RAPIDS_CUDA_VERSION: ${{ slice.rapids_cuda_version }} + RAPIDS_PY_VERSION: ${{ slice.rapids_py_version }} + steps: + - checkout: self + path: ucx + fetchDepth: 100 + retryCountOnTaskFailure: 5 + - checkout: ucxx + path: ucxx + retryCountOnTaskFailure: 5 + - bash: bash $(UCX_DIR)/buildlib/tools/test_ucxx.sh build + displayName: Build UCXX + - bash: bash $(UCX_DIR)/buildlib/tools/test_ucxx.sh test_cpp + displayName: Run UCXX C++ tests + # GPU slices only: cupy variants need a real device. + - ${{ if eq(slice.gpu, true) }}: + - bash: bash $(UCX_DIR)/buildlib/tools/test_ucxx.sh test_python + displayName: Run UCXX Python tests diff --git a/buildlib/tools/build_ucxx.sh b/buildlib/tools/build_ucxx.sh new file mode 100755 index 00000000000..4c60f85be26 --- /dev/null +++ b/buildlib/tools/build_ucxx.sh @@ -0,0 +1,97 @@ +#!/bin/bash -eE +# +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# See file LICENSE for terms. +# +# Usage: build_ucxx.sh +# Env: UCXX_DIR (all phases). Build phases also need RAPIDS_CUDA_VERSION, +# RAPIDS_PY_VERSION, RAPIDS_BLD_OUTPUT_DIR. +# wheel_ucxx phase also requires WHEEL_INPUT_DIR (libucxx wheel artifact dir) +# Docs phase env: CPP_CHANNEL_DIR, PYTHON_CHANNEL_DIR, RAPIDS_DOCS_DIR + +phase=${1:?phase required} +: "${UCXX_DIR:?UCXX_DIR required}" + +case "$phase" in + devcontainer) + # Parse each .devcontainer config; verify its Dockerfile + BASE exist + # (no registry pull - the devcontainer CLI catches missing images at use). + UCXX_DIR="$UCXX_DIR" python3 - <<'PY' +import glob, json, os, sys +root = os.environ["UCXX_DIR"] +cfgs = glob.glob(os.path.join(root, ".devcontainer", "*", "devcontainer.json")) +if not cfgs: + sys.exit("ERROR: no devcontainer.json under .devcontainer/") +for cfg in cfgs: + b = json.load(open(cfg))["build"] + df = b["dockerfile"].replace("${localWorkspaceFolder}", root) + assert os.path.isfile(df), f"{cfg}: missing Dockerfile {df}" + assert b["args"]["BASE"], f"{cfg}: empty BASE" + print(f"OK {cfg}") +PY + exit 0 ;; +esac + +: "${RAPIDS_CUDA_VERSION:?RAPIDS_CUDA_VERSION required}" +: "${RAPIDS_PY_VERSION:?RAPIDS_PY_VERSION required}" +: "${RAPIDS_BLD_OUTPUT_DIR:?RAPIDS_BLD_OUTPUT_DIR required}" + +export RAPIDS_CUDA_VERSION RAPIDS_PY_VERSION +mkdir -p "$RAPIDS_BLD_OUTPUT_DIR" + +case "$phase" in + conda_*) export RAPIDS_CONDA_BLD_OUTPUT_DIR="$RAPIDS_BLD_OUTPUT_DIR" ;; + wheel_*) export RAPIDS_WHEEL_BLD_OUTPUT_DIR="$RAPIDS_BLD_OUTPUT_DIR" ;; + docs) + : "${CPP_CHANNEL_DIR:?CPP_CHANNEL_DIR required for docs phase}" + : "${PYTHON_CHANNEL_DIR:?PYTHON_CHANNEL_DIR required for docs phase}" + : "${RAPIDS_DOCS_DIR:?RAPIDS_DOCS_DIR required for docs phase}" + mkdir -p "$RAPIDS_DOCS_DIR" ;; +esac + +mkdir -p "$HOME/.local/bin" +for tool in rapids-download-conda-from-github rapids-download-from-github; do + printf '#!/bin/bash\necho "%s"\n' "$RAPIDS_BLD_OUTPUT_DIR" > "$HOME/.local/bin/$tool" + chmod +x "$HOME/.local/bin/$tool" +done +# Docs phase: override shims to point at the staged conda channels. +if [ "$phase" = "docs" ]; then + printf '#!/bin/bash\necho "%s"\n' "$CPP_CHANNEL_DIR" > "$HOME/.local/bin/rapids-download-conda-from-github" + printf '#!/bin/bash\necho "%s"\n' "$PYTHON_CHANNEL_DIR" > "$HOME/.local/bin/rapids-download-from-github" +fi + +if [ -n "${WHEEL_INPUT_DIR:-}" ]; then + printf '#!/bin/bash\necho "%s"\n' "$WHEEL_INPUT_DIR" > "$HOME/.local/bin/rapids-download-wheels-from-github" + chmod +x "$HOME/.local/bin/rapids-download-wheels-from-github" +fi + +export PATH="$HOME/.local/bin:$PATH" + +cd "$UCXX_DIR" + +# Wheel builds otherwise pick system gcc 8.5 (too old for libucxx's C++20); +# point CC/CXX at gcc-toolset-14. +if [[ "$phase" == wheel_* ]]; then + toolset=/opt/rh/gcc-toolset-14/root/usr/bin + [ -x "$toolset/gcc" ] \ + || { echo "ERROR: gcc-toolset-14 not found at $toolset (needed for libucxx C++20)" >&2; exit 1; } + export CC="$toolset/gcc" CXX="$toolset/g++" +fi + +case "$phase" in + conda_cpp) bash ci/build_cpp.sh ;; + conda_python) bash ci/build_python.sh ;; + wheel_libucxx) bash ci/build_wheel_libucxx.sh ;; + wheel_ucxx) + : "${WHEEL_INPUT_DIR:?WHEEL_INPUT_DIR required for wheel_ucxx (libucxx wheel dir)}" + bash ci/build_wheel_ucxx.sh ;; + docs) + # Upstream forces RAPIDS_DOCS_DIR=$(mktemp -d); make it default-if-unset + # so our staged output dir survives. Guard catches upstream rewording. + sed -i 's|RAPIDS_DOCS_DIR="$(mktemp -d)"|: "${RAPIDS_DOCS_DIR:=$(mktemp -d)}"|' ci/build_docs.sh + grep -q 'RAPIDS_DOCS_DIR:=' ci/build_docs.sh \ + || { echo "ERROR: docs patch did not apply to ci/build_docs.sh" >&2; exit 1; } + bash ci/build_docs.sh ;; + *) echo "Unknown phase: $phase" >&2; exit 1 ;; +esac diff --git a/buildlib/tools/test_ucxx.sh b/buildlib/tools/test_ucxx.sh new file mode 100755 index 00000000000..b398b8beeb3 --- /dev/null +++ b/buildlib/tools/test_ucxx.sh @@ -0,0 +1,88 @@ +#!/bin/bash -eE +# +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# See file LICENSE for terms. +# +# Usage: test_ucxx.sh +# Env: RAPIDS_CUDA_VERSION, RAPIDS_PY_VERSION, UCXX_DIR +# build|test_cpp|test_python: also IS_GPU +# test_wheel_ucxx: also LIBUCXX_WHL_DIR, UCXX_WHL_DIR + +phase=${1:?phase required} +case "$phase" in + build|test_cpp|test_python) : "${IS_GPU:?IS_GPU required}" ;; +esac +: "${RAPIDS_CUDA_VERSION:?RAPIDS_CUDA_VERSION required}" +: "${RAPIDS_PY_VERSION:?RAPIDS_PY_VERSION required}" +: "${UCXX_DIR:?UCXX_DIR required}" + +export RAPIDS_CUDA_VERSION RAPIDS_PY_VERSION +export RAPIDS_CONDA_BLD_OUTPUT_DIR=/tmp/conda-bld-output +mkdir -p "$RAPIDS_CONDA_BLD_OUTPUT_DIR" "$HOME/.local/bin" + +for tool in rapids-download-conda-from-github rapids-download-from-github; do + printf '#!/bin/bash\necho "%s"\n' "$RAPIDS_CONDA_BLD_OUTPUT_DIR" > "$HOME/.local/bin/$tool" + chmod +x "$HOME/.local/bin/$tool" +done +export PATH="$HOME/.local/bin:$PATH" + +cd "$UCXX_DIR" + +# Tolerate missing nvidia-smi on CPU containers. Guard catches upstream rewording. +sed -i 's#^ nvidia-smi$# command -v nvidia-smi >/dev/null \&\& nvidia-smi || echo "(no GPU)"#' ci/test_common.sh +grep -q 'command -v nvidia-smi' ci/test_common.sh \ + || { echo "ERROR: nvidia-smi patch did not apply to ci/test_common.sh" >&2; exit 1; } + +# Skip test_client_shutdown: its teardown crashes the xdist worker under +# full-pipeline GPU/MPS contention (flaky upstream test, not a UCX issue). +# Guard catches upstream rewording (else the skip silently disappears). +sed -i "s#--runslow#--runslow -k 'not test_client_shutdown'#" ci/run_python.sh +grep -q "not test_client_shutdown" ci/run_python.sh \ + || { echo "ERROR: test_client_shutdown skip did not apply to ci/run_python.sh" >&2; exit 1; } + +# Force host driver ahead of the image's newer compat driver (MPS rejects a client +# newer than the daemon -> cuInit hangs). ubuntu: /usr/lib/-linux-gnu; wheel: /usr/lib64. +arch=$(uname -m) +for hostlib in "/usr/lib/$arch-linux-gnu" /usr/lib64; do + [ -d "$hostlib" ] && export LD_LIBRARY_PATH="$hostlib:${LD_LIBRARY_PATH:-}" +done + +case "$phase" in + build) + if [ "${IS_GPU,,}" = "true" ]; then + # sccache wrapper crashes CMake's compiler probe on the GPU build hosts; no-op it. + cat > "$HOME/.local/bin/rapids-configure-sccache" <<'EOF' +#!/bin/bash +export CMAKE_C_COMPILER_LAUNCHER= CMAKE_CXX_COMPILER_LAUNCHER= CMAKE_CUDA_COMPILER_LAUNCHER= RUSTC_WRAPPER= +EOF + chmod +x "$HOME/.local/bin/rapids-configure-sccache" + fi + bash ci/build_cpp.sh + bash ci/build_python.sh + ;; + + test_cpp) + # CPU slices have no GPU device bound; CUDA-touching gtests would crash. + if [ "${IS_GPU,,}" = "true" ]; then + bash ci/test_cpp.sh + else + CUDA_VISIBLE_DEVICES= UCX_TLS=tcp,sm,self GTEST_FILTER='-RMM*.*:CCCL*.*' \ + bash ci/test_cpp.sh + fi + ;; + + test_python) + bash ci/test_python.sh + ;; + + test_wheel_ucxx) + : "${LIBUCXX_WHL_DIR:?LIBUCXX_WHL_DIR required}" + : "${UCXX_WHL_DIR:?UCXX_WHL_DIR required}" + printf '#!/bin/bash\necho "%s"\n' "$LIBUCXX_WHL_DIR" > "$HOME/.local/bin/rapids-download-wheels-from-github" + printf '#!/bin/bash\necho "%s"\n' "$UCXX_WHL_DIR" > "$HOME/.local/bin/rapids-download-from-github" + chmod +x "$HOME/.local/bin/rapids-download-wheels-from-github" "$HOME/.local/bin/rapids-download-from-github" + bash ci/test_wheel_ucxx.sh + ;; + + *) echo "Unknown phase: $phase" >&2; exit 1 ;; +esac