diff --git a/.ci/docker/README.md b/.ci/docker/README.md
new file mode 100644
index 0000000000..7771f015f9
--- /dev/null
+++ b/.ci/docker/README.md
@@ -0,0 +1,91 @@
+# torch-npu CI Docker Images
+
+本目录管理 torch-npu 项目的 CI Docker 镜像,包括**构建镜像 (builder)** 和**测试镜像 (test)** 两类,每类分别支持 x86_64 和 aarch64 架构。
+
+## 镜像类型
+
+| 类型 | 基座 | 用途 |
+|------|------|------|
+| **builder** | manylinux2_28-builder | 编译构建 torch-npu wheel 包,包含完整编译工具链 |
+| **test** | ubuntu:22.04 | CI 单元测试运行环境,包含 PyTorch CPU、CANN runtime、triton-ascend 和测试框架 |
+
+## 目录结构
+
+```
+.ci/docker/
+├── README.md
+├── requirements-builder.txt      # Builder 镜像 pip 依赖
+├── requirements-test.txt         # Test 镜像 pip 依赖
+├── docker_build.sh               # 构建入口脚本
+├── common/                       # 共享安装脚本
+│   ├── install_cann.sh           # 安装 CANN toolkit (支持 A1/A2/A3)
+│   ├── install_triton.sh         # 安装 triton-ascend (需传 Python 版本)
+│   └── install_obs.sh            # 安装华为 OBS util
+├── builder/
+│   ├── Dockerfile.x86_64
+│   └── Dockerfile.aarch64
+└── test/
+    ├── Dockerfile.x86_64
+    └── Dockerfile.aarch64
+```
+
+## 快速构建
+
+```bash
+# Builder 镜像 (不含 CANN)
+./docker_build.sh torch-npu-builder-x86_64-torch2.7.1
+./docker_build.sh torch-npu-builder-aarch64-torch2.7.1
+
+# Test 镜像 (含 CANN)
+./docker_build.sh torch-npu-test-x86_64-cann-a1-py3.10-torch2.7.1
+./docker_build.sh torch-npu-test-aarch64-cann-a2-py3.10-torch2.7.1
+```
+
+## Tag 命名规范
+
+参考上游 PyTorch `pytorch-linux-jammy-cuda12.4-cudnn9-py3-gcc11` 模式,tag 即为最终镜像名:
+
+**Builder**(不含 CANN):
+```
+torch-npu-builder-<ARCH>-torch<PYTORCH_VERSION>
+```
+```
+./docker_build.sh torch-npu-builder-x86_64-torch2.7.1
+#                 ^         ^       ^      ^
+#                 |         |       |      └── PyTorch 版本 (torch2.7.1)
+#                 |         |       └── 架构
+#                 |         └── 镜像类型
+#                 └── 固定前缀
+```
+
+**Test**(含 CANN runtime):
+```
+torch-npu-test-<ARCH>-cann-<CHIP>-py<PYTHON_VERSION>-torch<PYTORCH_VERSION>
+```
+```
+./docker_build.sh torch-npu-test-x86_64-cann-a1-py3.10-torch2.7.1
+#                           ^    ^           ^  ^ ^    ^    ^
+#                           |    |           |  | |    |    └── PyTorch 版本
+#                           |    |           |  | |    └── torch 前缀
+#                           |    |           |  | └── Python 版本
+#                           |    |           |  └── py 前缀
+#                           |    |           └── CANN 芯片 (A1/A2/A3)
+#                           |    └── 架构
+#                           └── 镜像类型
+```
+
+| 字段 | 可选值 |
+|------|--------|
+| IMAGE_TYPE |
builder, test |
+| ARCH | x86_64, aarch64 |
+| CHIP | A1 (Ascend 910), A2 (Ascend 910b), A3 (Ascend A3);仅 test |
+| PYTHON_VERSION | 3.10 (仅 test) |
+| PYTORCH_VERSION | 2.7.1 |
+
+## CANN 芯片映射
+
+| CANN_CHIP | 芯片 | CANN 版本 |
+|-----------|------|----------|
+| A1 | Ascend 910 | 9.1.0 (x86_64) / 9.0.0-beta.1 (aarch64) |
+| A2 | Ascend 910b | 9.1.0-beta.1 (x86_64 / aarch64) |
+| A3 | Ascend A3 | 9.1.0-beta.1 (x86_64 / aarch64) |
diff --git a/.ci/docker/builder/Dockerfile.aarch64 b/.ci/docker/builder/Dockerfile.aarch64
new file mode 100644
index 0000000000..7cc382123c
--- /dev/null
+++ b/.ci/docker/builder/Dockerfile.aarch64
@@ -0,0 +1,72 @@
+FROM pytorch/manylinux2_28_aarch64-builder:cpu-aarch64-2.7
+
+ARG PYTORCH_VERSION=2.7.1
+
+ENV PATH=/usr/local/bin:$PATH
+ENV AUDITWHEEL_PLAT=manylinux_2_28_aarch64
+ENV ETCD_UNSUPPORTED_ARCH=arm64
+ENV PYTORCH_VERSION=${PYTORCH_VERSION}
+
+COPY requirements-builder.txt /opt/buildtools/
+
+# Set pip & python symlinks (python3 / pip3 resolve to CPython 3.10)
+RUN cd /usr/local/bin \
+    && ln -sf /opt/_internal/cpython-3.9.21/bin/pip3.9 pip3.9 \
+    && ln -sf /opt/_internal/cpython-3.10.16/bin/pip3.10 pip3.10 \
+    && ln -sf /opt/_internal/cpython-3.11.11/bin/pip3.11 pip3.11 \
+    && ln -sf /opt/_internal/cpython-3.12.9/bin/pip3.12 pip3.12 \
+    && ln -sf /opt/_internal/cpython-3.13.2/bin/pip3.13 pip3.13 \
+    && ln -sf /opt/_internal/cpython-3.10.16/bin/pip3.10 pip3 \
+    && ln -sf /opt/_internal/cpython-3.9.21/bin/python3.9 python3.9 \
+    && ln -sf /opt/_internal/cpython-3.10.16/bin/python3.10 python3.10 \
+    && ln -sf /opt/_internal/cpython-3.11.11/bin/python3.11 python3.11 \
+    && ln -sf /opt/_internal/cpython-3.12.9/bin/python3.12 python3.12 \
+    && ln -sf /opt/_internal/cpython-3.13.2/bin/python3.13 python3.13 \
+    && ln -sf /opt/_internal/cpython-3.10.16/bin/python3.10 python3
+
+# Install PyTorch from official source, then build requirements from PyPI for each python version
+RUN pip3.9 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION} \
+    && pip3.9 install --no-cache-dir -r /opt/buildtools/requirements-builder.txt \
+    && pip3.10 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION} \
+    && pip3.10 install --no-cache-dir -r /opt/buildtools/requirements-builder.txt \
+    && pip3.11 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION} \
+    && pip3.11 install --no-cache-dir -r /opt/buildtools/requirements-builder.txt \
+    && pip3.12 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION} \
+    && pip3.12 install --no-cache-dir -r /opt/buildtools/requirements-builder.txt \
+    && pip3.13 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION} \
+    && pip3.13 install --no-cache-dir auditwheel==5.4.0 \
+    && pip3.13 install --no-cache-dir -r /opt/buildtools/requirements-builder.txt \
+    && ln -sf /opt/_internal/cpython-3.13.2/bin/auditwheel /usr/local/bin/auditwheel
+
+# Install system build tools; downloaded archives are removed in the same layer so they never reach the image
+RUN echo "alias ll='ls -l --color=auto'" >> /root/.bashrc \
+    && yum install -y vim-common --disablerepo=ius \
+    && yum install -y ninja-build binutils lld mold dos2unix gcc gcc-c++ make cmake3 wget tar unzip elfutils java-1.8.0-openjdk-devel \
+    && cd /tmp \
+    && wget https://github.com/ccache/ccache/releases/download/v4.10/ccache-4.10.tar.gz \
+    && tar -xzf ccache-4.10.tar.gz \
+    && cd ccache-4.10 \
+    && mkdir build \
+    && cd build \
+    && cmake3 .. \
+    && make -j$(nproc) \
+    && make install \
+    && cd /tmp \
+    && rm -rf ccache-4.10* \
+    && ccache --version \
+    && wget https://github.com/etcd-io/etcd/releases/download/v3.4.3/etcd-v3.4.3-linux-arm64.tar.gz \
+    && tar -zxf etcd-v3.4.3-linux-arm64.tar.gz \
+    && mv etcd-v3.4.3-linux-arm64/etcd /usr/local/bin/ \
+    && rm -rf etcd-v3.4.3-linux-arm64* \
+    && pip3.10 install --no-cache-dir python-etcd \
+    && etcd --version \
+    && yum update -y \
+    && yum clean all
+
+# Set timezone
+RUN rm -f /etc/localtime \
+    && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
+    && echo 'Asia/Shanghai' >/etc/timezone \
+    && echo "export TZ='Asia/Shanghai'" >>/etc/profile
+
+WORKDIR /home
diff --git a/.ci/docker/builder/Dockerfile.x86_64 b/.ci/docker/builder/Dockerfile.x86_64
new file mode 100644
index 0000000000..551adc2626
--- /dev/null
+++ b/.ci/docker/builder/Dockerfile.x86_64
@@ -0,0 +1,83 @@
+FROM pytorch/manylinux2_28-builder:cpu-2.7
+
+ARG PYTORCH_VERSION=2.7.1
+
+ENV PATH=/usr/local/bin:$PATH
+ENV AUDITWHEEL_PLAT=manylinux_2_28_x86_64
+ENV PYTORCH_VERSION=${PYTORCH_VERSION}
+
+COPY requirements-builder.txt /opt/buildtools/
+
+# Set pip & python symlinks (python3 / pip3 resolve to CPython 3.10)
+RUN cd /usr/local/bin \
+    && ln -sf /opt/_internal/cpython-3.9.21/bin/pip3.9 pip3.9 \
+    && ln -sf /opt/_internal/cpython-3.10.16/bin/pip3.10 pip3.10 \
+    && ln -sf /opt/_internal/cpython-3.11.11/bin/pip3.11 pip3.11 \
+    && ln -sf /opt/_internal/cpython-3.12.9/bin/pip3.12 pip3.12 \
+    && ln -sf /opt/_internal/cpython-3.13.2/bin/pip3.13 pip3.13 \
+    && ln -sf /opt/_internal/cpython-3.10.16/bin/pip3.10 pip3 \
+    && ln -sf /opt/_internal/cpython-3.9.21/bin/python3.9 python3.9 \
+    && ln -sf /opt/_internal/cpython-3.10.16/bin/python3.10 python3.10 \
+    && ln -sf /opt/_internal/cpython-3.11.11/bin/python3.11 python3.11 \
+    && ln -sf /opt/_internal/cpython-3.12.9/bin/python3.12 python3.12 \
+    && ln -sf /opt/_internal/cpython-3.13.2/bin/python3.13 python3.13 \
+    && ln -sf /opt/_internal/cpython-3.10.16/bin/python3.10 python3
+
+# Install PyTorch from official source, then build requirements from PyPI for each python version
+RUN pip3.9 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION}+cpu \
+    && pip3.9 install --no-cache-dir -r /opt/buildtools/requirements-builder.txt \
+    && pip3.10 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION}+cpu \
+    && pip3.10 install --no-cache-dir -r /opt/buildtools/requirements-builder.txt \
+    && pip3.11 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION}+cpu \
+    && pip3.11 install --no-cache-dir -r /opt/buildtools/requirements-builder.txt \
+    && pip3.12 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION}+cpu \
+    && pip3.12 install --no-cache-dir -r /opt/buildtools/requirements-builder.txt \
+    && pip3.13 install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION}+cpu \
+    && pip3.13 install --no-cache-dir auditwheel==5.4.0 \
+    && pip3.13 install --no-cache-dir -r /opt/buildtools/requirements-builder.txt \
+    && ln -sf /opt/_internal/cpython-3.13.2/bin/auditwheel /usr/local/bin/auditwheel
+
+# Install system build tools; downloaded archives are removed in the same layer so they never reach the image
+RUN yum remove -y ius-release epel-release 2>/dev/null || true \
+    && rm -rf /etc/yum.repos.d/ius*.repo /etc/yum.repos.d/epel*.repo \
+    && yum clean all && rm -rf /var/cache/dnf /var/cache/yum \
+    && echo "alias ll='ls -l --color=auto'" >> /root/.bashrc \
+    && yum install -y vim-common --disablerepo=ius \
+    && yum install -y binutils lld dos2unix gcc gcc-c++ make cmake3 wget tar unzip elfutils java-1.8.0-openjdk-devel \
+    && cd /tmp \
+    && wget -q https://github.com/ninja-build/ninja/releases/download/v1.12.1/ninja-linux.zip \
+    && unzip ninja-linux.zip \
+    && cp ninja /usr/local/bin/ && chmod +x /usr/local/bin/ninja \
+    && cd /tmp \
+    && wget -q https://github.com/rui314/mold/archive/refs/tags/v2.32.1.tar.gz \
+    && tar -xf v2.32.1.tar.gz \
+    && cd mold-2.32.1 \
+    && cmake -DCMAKE_BUILD_TYPE=Release -DMOLD_MOSTLY_STATIC=ON . \
+    && make -j$(nproc) && make install \
+    && cd /tmp \
+    && wget https://github.com/ccache/ccache/releases/download/v4.10/ccache-4.10.tar.gz \
+    && tar -xzf ccache-4.10.tar.gz \
+    && cd ccache-4.10 \
+    && mkdir build \
+    && cd build \
+    && cmake3 .. \
+    && make -j$(nproc) \
+    && make install \
+    && cd /tmp && rm -rf /tmp/* \
+    && ninja --version && mold --version && ccache --version \
+    && wget https://github.com/etcd-io/etcd/releases/download/v3.4.3/etcd-v3.4.3-linux-amd64.tar.gz \
+    && tar -zxf etcd-v3.4.3-linux-amd64.tar.gz \
+    && mv etcd-v3.4.3-linux-amd64/etcd /usr/local/bin/ \
+    && rm -rf etcd-v3.4.3-linux-amd64* \
+    && pip3.10 install --no-cache-dir python-etcd \
+    && etcd --version \
+    && yum update -y \
+    && yum clean all
+
+# Set timezone
+RUN rm -f /etc/localtime \
+    && cp /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
+    && echo 'Asia/Shanghai' >/etc/timezone \
+    && echo "export TZ='Asia/Shanghai'" >>/etc/profile
+
+WORKDIR /home
diff --git a/.ci/docker/common/install_cann.sh b/.ci/docker/common/install_cann.sh
new file mode 100755
index 0000000000..4af49bbb89
--- /dev/null
+++ b/.ci/docker/common/install_cann.sh
@@ -0,0 +1,109 @@
+#!/usr/bin/bash
+# Install CANN toolkit for Ascend NPU.
+# Usage: CANN_CHIP=A1 ./install_cann.sh
+# CANN_CHIP: A1 (Ascend 910), A2 (Ascend 910b), A3 (Ascend A3)
+# Automatically detects architecture (x86_64 / aarch64).
+
+set -e
+
+CANN_CHIP="${CANN_CHIP:-A1}"
+ARCH=$(uname -m)
+
+BASE_URL="https://pytorch-package.obs.cn-north-4.myhuaweicloud.com/pta/cann-package"
+CANN_BASE_URL="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%209.1.T1"
+INSTALL_ROOT="/usr/local/Ascend"
+# Every package set below exposes the toolkit environment script at the same path.
+SET_ENV_PATH="${INSTALL_ROOT}/cann/set_env.sh"
+
+case "${ARCH}_${CANN_CHIP}" in
+    x86_64_A1)
+        TOOLKIT_URL="${BASE_URL}/20260513/Ascend-cann-toolkit_9.1.0_linux-x86_64.run"
+        OPS_URL="${BASE_URL}/20260513/Ascend-cann-910-ops_9.1.0_linux-x86_64.run"
+        NNAL_URL="${BASE_URL}/20260513/Ascend-cann-nnal_9.1.0_linux-x86_64.run"
+        OPS_GLOB="Ascend-cann-910*"
+        ;;
+    x86_64_A2)
+        TOOLKIT_URL="${CANN_BASE_URL}/Ascend-cann-toolkit_9.1.0-beta.1_linux-x86_64.run"
+        OPS_URL="${CANN_BASE_URL}/Ascend-cann-910b-ops_9.1.0-beta.1_linux-x86_64.run"
+        NNAL_URL="${CANN_BASE_URL}/Ascend-cann-nnal_9.1.0-beta.1_linux-x86_64.run"
+        OPS_GLOB="Ascend-cann-910b-ops*"
+        ;;
+    x86_64_A3)
+        TOOLKIT_URL="${CANN_BASE_URL}/Ascend-cann-toolkit_9.1.0-beta.1_linux-x86_64.run"
+        OPS_URL="${CANN_BASE_URL}/Ascend-cann-A3-ops_9.1.0-beta.1_linux-x86_64.run"
+        NNAL_URL="${CANN_BASE_URL}/Ascend-cann-nnal_9.1.0-beta.1_linux-x86_64.run"
+        OPS_GLOB="Ascend-cann-A3-ops*"
+        ;;
+    aarch64_A1)
+        # NOTE(review): A1 is documented as Ascend 910, but this entry pulls the 910b ops package -- confirm.
+        TOOLKIT_URL="${BASE_URL}/20260302/Ascend-cann-toolkit_9.0.0-beta.1_linux-aarch64.run"
+        OPS_URL="${BASE_URL}/20260302/Ascend-cann-910b-ops_9.0.0-beta.1_linux-aarch64.run"
+        NNAL_URL="${BASE_URL}/20260302/Ascend-cann-nnal_9.0.0-beta.1_linux-aarch64.run"
+        OPS_GLOB="Ascend-cann-910b*"
+        ;;
+    aarch64_A2)
+        TOOLKIT_URL="${CANN_BASE_URL}/Ascend-cann-toolkit_9.1.0-beta.1_linux-aarch64.run"
+        OPS_URL="${CANN_BASE_URL}/Ascend-cann-910b-ops_9.1.0-beta.1_linux-aarch64.run"
+        NNAL_URL="${CANN_BASE_URL}/Ascend-cann-nnal_9.1.0-beta.1_linux-aarch64.run"
+        OPS_GLOB="Ascend-cann-910b-ops*"
+        ;;
+    aarch64_A3)
+        TOOLKIT_URL="${CANN_BASE_URL}/Ascend-cann-toolkit_9.1.0-beta.1_linux-aarch64.run"
+        OPS_URL="${CANN_BASE_URL}/Ascend-cann-A3-ops_9.1.0-beta.1_linux-aarch64.run"
+        NNAL_URL="${CANN_BASE_URL}/Ascend-cann-nnal_9.1.0-beta.1_linux-aarch64.run"
+        OPS_GLOB="Ascend-cann-A3-ops*"
+        ;;
+    *)
+        echo "Unsupported combination: ${ARCH} + ${CANN_CHIP}"
+        exit 1
+        ;;
+esac
+
+# Some CANN versions install to a versioned directory (e.g. cann-9.0.0-beta.2)
+# instead of ${INSTALL_ROOT}/cann. Link the canonical path to it so set_env.sh
+# can be sourced both by this script and at container runtime.
+link_versioned_cann() {
+    local real_dir
+    if [ -f "${SET_ENV_PATH}" ]; then return 0; fi
+    real_dir=$(ls -d "${INSTALL_ROOT}"/cann-* 2>/dev/null | head -1)
+    if [ -z "${real_dir}" ]; then return 0; fi
+    ln -sfn "${real_dir}" "${INSTALL_ROOT}/cann"
+    echo "Fixed: linked ${real_dir} -> ${INSTALL_ROOT}/cann"
+}
+
+echo "Installing CANN ${CANN_CHIP} for ${ARCH}..."
+
+echo "=== Creating HwHiAiUser user and group ==="
+groupadd -f HwHiAiUser
+id -u HwHiAiUser >/dev/null 2>&1 || useradd -g HwHiAiUser -d /home/HwHiAiUser -m HwHiAiUser -s /bin/bash
+
+rm -rf cann
+mkdir -p cann && cd cann
+
+echo "=== Downloading CANN packages ==="
+# -f makes an HTTP error fail the build instead of saving the error page as a .run file.
+for url in "${TOOLKIT_URL}" "${OPS_URL}" "${NNAL_URL}"; do
+    curl -fL --retry 3 -O "${url}"
+done
+echo "Download complete."
+
+chmod +x Ascend-cann*.run
+
+echo "=== Installing CANN toolkit ==="
+./Ascend-cann-toolkit*.run --full --quiet --install-path="${INSTALL_ROOT}"
+link_versioned_cann   # must run before the source below: under set -e a missing file aborts the script
+source "${SET_ENV_PATH}"
+echo "toolkit install success"
+
+echo "=== Installing CANN ops ==="
+./${OPS_GLOB}.run --install --quiet --install-path="${INSTALL_ROOT}"
+echo "ops install success"
+
+echo "=== Installing CANN nnal ==="
+./Ascend-cann-nnal*.run --install --quiet --install-path="${INSTALL_ROOT}"
+source "${INSTALL_ROOT}/nnal/atb/set_env.sh"
+echo "nnal install success"
+
+link_versioned_cann   # re-check: the ops / nnal installers may have changed the layout
+cd .. && rm -rf cann
+echo "CANN ${CANN_CHIP} installation complete."
\ No newline at end of file
diff --git a/.ci/docker/common/install_obs.sh b/.ci/docker/common/install_obs.sh
new file mode 100755
index 0000000000..1acea84f2b
--- /dev/null
+++ b/.ci/docker/common/install_obs.sh
@@ -0,0 +1,32 @@
+#!/usr/bin/bash
+# Install Huawei OBS util for object storage access.
+# The archive is unpacked under /usr/local/obsutil and the CLI is linked to /usr/local/bin/obsutil.
+
+set -e
+
+ARCH=$(uname -m)
+case "${ARCH}" in
+    x86_64) OBS_ARCH="amd64" ;;
+    aarch64) OBS_ARCH="arm64" ;;
+    *) echo "Unsupported architecture: ${ARCH}"; exit 1 ;;
+esac
+
+OBS_TARBALL="obsutil_linux_${OBS_ARCH}.tar.gz"
+OBS_URL="https://obs-community.obs.cn-north-1.myhuaweicloud.com/obsutil/current/${OBS_TARBALL}"
+
+wget -q "${OBS_URL}"
+mkdir -p /usr/local/obsutil
+tar -zxf "${OBS_TARBALL}" -C /usr/local/obsutil/
+rm -f "${OBS_TARBALL}"
+
+# The archive unpacks into a versioned directory. Resolve the binary explicitly so that
+# a changed archive layout fails the build instead of leaving a dangling symlink behind.
+OBS_BIN=$(ls -d /usr/local/obsutil/obsutil_linux_${OBS_ARCH}_*/obsutil 2>/dev/null | head -1)
+if [ ! -f "${OBS_BIN}" ]; then
+    echo "obsutil binary not found under /usr/local/obsutil"
+    exit 1
+fi
+chmod 755 "${OBS_BIN}"
+ln -sf "${OBS_BIN}" /usr/local/bin/obsutil
+
+echo "OBS util installed."
diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh
new file mode 100755
index 0000000000..ed76bca16d
--- /dev/null
+++ b/.ci/docker/common/install_triton.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/bash
+# Install triton-ascend for NPU.
+# Usage: ./install_triton.sh <PYTHON_VERSION>
+# PYTHON_VERSION: e.g. 3.10, 3.11, 3.12, 3.13
+# TRITON_VERSION (environment, optional): release to install, defaults to 3.2.1
+
+set -e
+
+TRITON_VERSION="${TRITON_VERSION:-3.2.1}"
+PYTHON_VERSION="${1:?Usage: $0 <PYTHON_VERSION> (e.g. 3.10)}"
+
+ARCH=$(uname -m)
+PY_SHORT=$(echo "${PYTHON_VERSION}" | tr -d '.')
+
+TRITON_WHL="triton_ascend-${TRITON_VERSION}-cp${PY_SHORT}-cp${PY_SHORT}-manylinux_2_27_${ARCH}.manylinux_2_28_${ARCH}.whl"
+TRITON_URL="https://gitcode.com/Ascend/triton-ascend/releases/download/v${TRITON_VERSION}/${TRITON_WHL}"
+
+# The wheel name is built for one CPython version, so install it with that interpreter's
+# pip when it is on PATH; otherwise keep the previous behaviour and use the default pip3.
+if command -v "python${PYTHON_VERSION}" >/dev/null 2>&1; then
+    PIP_CMD="python${PYTHON_VERSION} -m pip"
+else
+    PIP_CMD="pip3"
+fi
+
+echo "Installing triton-ascend ${TRITON_VERSION} for Python ${PYTHON_VERSION} (${ARCH})..."
+${PIP_CMD} install --no-cache-dir "${TRITON_URL}"
+echo "triton-ascend installed."
diff --git a/.ci/docker/docker_build.sh b/.ci/docker/docker_build.sh
new file mode 100755
index 0000000000..2183f141d6
--- /dev/null
+++ b/.ci/docker/docker_build.sh
@@ -0,0 +1,84 @@
+#!/usr/bin/bash
+# Build torch-npu CI Docker images.
+#
+# Usage:
+#   ./docker_build.sh <tag> [extra docker build args...]
+#
+#   Builder: torch-npu-builder-<ARCH>-torch<PYTORCH_VERSION>
+#   Test:    torch-npu-test-<ARCH>-cann-<CHIP>-py<PYTHON_VERSION>-torch<PYTORCH_VERSION>
+#
+# Examples:
+#   ./docker_build.sh torch-npu-builder-x86_64-torch2.7.1
+#   ./docker_build.sh torch-npu-test-aarch64-cann-a2-py3.10-torch2.7.1
+#
+# The image is tagged <tag>-<TIMESTAMP>. TIMESTAMP defaults to the current UTC
+# time (YYYYmmddHHMM) and may be preset through the environment.
+#
+# Reference: pytorch/pytorch .ci/docker/build.sh
+
+set -ex
+
+tag="${1:?Usage: $0 <tag> [extra docker build args...]}"
+shift
+
+# Supported tags. Each capture group becomes one build parameter.
+builder_re='^torch-npu-builder-(x86_64|aarch64)-torch(2\.7\.1)$'
+test_re='^torch-npu-test-(x86_64|aarch64)-cann-(a1|a2|a3)-py(3\.10)-torch(2\.7\.1)$'
+
+if [[ "${tag}" =~ ${builder_re} ]]; then
+    IMAGE_TYPE=builder
+    ARCH="${BASH_REMATCH[1]}"
+    PYTORCH_VERSION="${BASH_REMATCH[2]}"
+    # Builder images take neither argument; clear values inherited from the environment.
+    CANN_CHIP=""
+    PYTHON_VERSION=""
+elif [[ "${tag}" =~ ${test_re} ]]; then
+    IMAGE_TYPE=test
+    ARCH="${BASH_REMATCH[1]}"
+    # The tag spells the chip in lower case; install_cann.sh matches A1/A2/A3.
+    CANN_CHIP="${BASH_REMATCH[2]^^}"
+    PYTHON_VERSION="${BASH_REMATCH[3]}"
+    PYTORCH_VERSION="${BASH_REMATCH[4]}"
+else
+    echo "Unknown tag: ${tag}"
+    echo "  Builder: torch-npu-builder-<ARCH>-torch2.7.1"
+    echo "  Test:    torch-npu-test-<ARCH>-cann-<CHIP>-py3.10-torch2.7.1"
+    exit 1
+fi
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+DOCKERFILE="${SCRIPT_DIR}/${IMAGE_TYPE}/Dockerfile.${ARCH}"
+
+if [[ ! -f "${DOCKERFILE}" ]]; then
+    echo "Dockerfile not found: ${DOCKERFILE}"
+    exit 1
+fi
+
+BUILD_ARGS=(
+    --build-arg PYTORCH_VERSION="${PYTORCH_VERSION}"
+)
+if [[ -n "${CANN_CHIP:-}" ]]; then
+    BUILD_ARGS+=(--build-arg CANN_CHIP="${CANN_CHIP}")
+fi
+if [[ -n "${PYTHON_VERSION:-}" ]]; then
+    BUILD_ARGS+=(--build-arg PYTHON_VERSION="${PYTHON_VERSION}")
+fi
+
+TIMESTAMP="${TIMESTAMP:-$(date -u +%Y%m%d%H%M)}"
+IMAGE_TAG="${tag}-${TIMESTAMP}"
+
+echo "Building ${IMAGE_TAG} ..."
+echo "  Dockerfile: ${DOCKERFILE}"
+echo "  PyTorch:    ${PYTORCH_VERSION}"
+[[ -n "${PYTHON_VERSION:-}" ]] && echo "  Python:     ${PYTHON_VERSION}"
+[[ -n "${CANN_CHIP:-}" ]] && echo "  CANN chip:  ${CANN_CHIP}"
+
+# Extra command-line arguments are forwarded to docker build unchanged.
+docker build \
+    -f "${DOCKERFILE}" \
+    -t "${IMAGE_TAG}" \
+    "${BUILD_ARGS[@]}" \
+    "$@" \
+    "${SCRIPT_DIR}"
+
+echo "Image built: ${IMAGE_TAG}"
diff --git a/.ci/docker/requirements-builder.txt b/.ci/docker/requirements-builder.txt
new file mode 100644
index 0000000000..871634e0df
--- /dev/null
+++ b/.ci/docker/requirements-builder.txt
@@ -0,0 +1,5 @@
+numpy==1.26.4
+pybind11==2.13.1
+pyyaml
+setuptools==75.3.2
+wheel
diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt
deleted file mode 100644
index 8602d4d0fa..0000000000
--- a/.ci/docker/requirements-ci.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-# Python dependencies required for unit tests
-
-mypy==1.9.0
-# Pin MyPy version because new errors are likely to appear with each release
-#Description: linter
-#Pinned versions: 1.9.0
-#test that import: test_typing.py, test_type_hints.py
diff --git a/.ci/docker/requirements-test.txt b/.ci/docker/requirements-test.txt
new file mode 100644
index 0000000000..ecb003b272
--- /dev/null
+++ b/.ci/docker/requirements-test.txt
@@ -0,0 +1,42 @@
+# Python dependencies required for CI unit tests
+
+-f https://data.pyg.org/whl/torch-2.7.1+cpu.html
+
+# Test frameworks
+pytest==8.1.1
+pytest-xdist
+pytest-subtests
+coverage
+hypothesis
+parameterized==0.9.0
+expecttest==0.1.3
+unittest-xml-reporting
+
+mypy==1.14.0
+
+ml-dtypes==0.5.1
+numpy==1.26.4
+pytest-timeout==2.3.1
+
+onnx==1.17.0
+onnxruntime==1.18.1
+onnxscript==0.2.2
+
+Pillow==10.3.0
+protobuf==3.20.2
+requests==2.32.0
+
+torch_geometric==2.5.3
+torch-scatter==2.1.2
+torchvision==0.22.1
+transformers==4.40.0
+
+tabulate==0.9.0
+importlib_metadata
+optree
+packaging
+psutil
+scipy
+z3-solver==4.13.0.0
+zstandard==0.25.0
+pulp==3.3.1
diff --git a/.ci/docker/test/Dockerfile.aarch64 b/.ci/docker/test/Dockerfile.aarch64
new file mode 100644
index 0000000000..1b44929e00
--- /dev/null
+++ b/.ci/docker/test/Dockerfile.aarch64
@@ -0,0 +1,54 @@
+FROM ubuntu:22.04
+
+ARG PYTORCH_VERSION=2.7.1
+ARG CANN_CHIP=A2
+ARG PYTHON_VERSION=3.10
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Asia/Shanghai
+ENV PATH=/usr/local/bin:$PATH
+ENV PYTORCH_VERSION=${PYTORCH_VERSION}
+ENV CANN_CHIP=${CANN_CHIP}
+
+COPY common/ /opt/buildtools/
+COPY requirements-test.txt /opt/buildtools/
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        curl \
+        dos2unix \
+        gcc \
+        g++ \
+        git \
+        make \
+        python3 \
+        python3-dev \
+        python3-pip \
+        python3-venv \
+        tar \
+        tzdata \
+        unzip \
+        vim \
+        wget \
+    && ln -sf /usr/bin/python3 /usr/bin/python \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade pip/setuptools/wheel
+RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel
+
+# Install CANN and OBS (install_cann.sh reads the chip from the CANN_CHIP environment variable)
+RUN chmod -R 755 /opt/buildtools/* \
+    && dos2unix /opt/buildtools/* \
+    && /opt/buildtools/install_cann.sh \
+    && /opt/buildtools/install_obs.sh
+
+# Install triton-ascend for the Python version selected by the PYTHON_VERSION build argument
+RUN /opt/buildtools/install_triton.sh "${PYTHON_VERSION}"
+
+# Install PyTorch from official source, then test requirements from PyPI
+RUN python3 -m pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION} \
+    && python3 -m pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /opt/buildtools/requirements-test.txt
+
+WORKDIR /home
diff --git a/.ci/docker/test/Dockerfile.x86_64 b/.ci/docker/test/Dockerfile.x86_64
new file mode 100644
index 0000000000..e698f04d6c
--- /dev/null
+++ b/.ci/docker/test/Dockerfile.x86_64
@@ -0,0 +1,54 @@
+FROM ubuntu:22.04
+
+ARG PYTORCH_VERSION=2.7.1
+ARG CANN_CHIP=A1
+ARG PYTHON_VERSION=3.10
+
+ENV DEBIAN_FRONTEND=noninteractive
+ENV TZ=Asia/Shanghai
+ENV PATH=/usr/local/bin:$PATH
+ENV PYTORCH_VERSION=${PYTORCH_VERSION}
+ENV CANN_CHIP=${CANN_CHIP}
+
+COPY common/ /opt/buildtools/
+COPY requirements-test.txt /opt/buildtools/
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        curl \
+        dos2unix \
+        gcc \
+        g++ \
+        git \
+        make \
+        python3 \
+        python3-dev \
+        python3-pip \
+        python3-venv \
+        tar \
+        tzdata \
+        unzip \
+        vim \
+        wget \
+    && ln -sf /usr/bin/python3 /usr/bin/python \
+    && rm -rf /var/lib/apt/lists/*
+
+# Upgrade pip/setuptools/wheel
+RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel
+
+# Install CANN and OBS (install_cann.sh reads the chip from the CANN_CHIP environment variable)
+RUN chmod -R 755 /opt/buildtools/* \
+    && dos2unix /opt/buildtools/* \
+    && /opt/buildtools/install_cann.sh \
+    && /opt/buildtools/install_obs.sh
+
+# Install triton-ascend for the Python version selected by the PYTHON_VERSION build argument
+RUN /opt/buildtools/install_triton.sh "${PYTHON_VERSION}"
+
+# Install PyTorch from official source, then test requirements from PyPI
+RUN python3 -m pip install --no-cache-dir --index-url https://download.pytorch.org/whl/cpu torch==${PYTORCH_VERSION} \
+    && python3 -m pip install --no-cache-dir --extra-index-url https://download.pytorch.org/whl/cpu -r /opt/buildtools/requirements-test.txt
+
+WORKDIR /home
diff --git a/.github/scripts/detect_changed_patches.sh b/.github/scripts/detect_changed_patches.sh index 59fb6c8dda..f0738d6a09 100644 --- a/.github/scripts/detect_changed_patches.sh +++ b/.github/scripts/detect_changed_patches.sh @@ -73,9 +73,7 @@ while IFS= read -r f; do # test_upstream/test/test_autograd.py.patch → test_autograd.py # test_upstream/test/ao/test_foo.py.patch → ao/test_foo.py # 
test_upstream/test/inductor/test_minifer.diff → inductor/test_minifer.py - TEST_FILE=$(echo "$f" | sed 's|^test_upstream/test/||; s|\.patch$||; s|\.diff$||') - # Ensure .py extension for cases where patch suffix was on bare name (e.g. test_foo.diff) - [[ "$TEST_FILE" != *.py ]] && TEST_FILE="${TEST_FILE}.py" + TEST_FILE=$(echo "$f" | sed 's|^test_upstream/test/||; s|\.patch$||; s|\.diff$|.py|') TEST_PATCHES="${TEST_PATCHES}${f}," TEST_FILES="${TEST_FILES}${TEST_FILE}," echo " → test patch: $f → test file: ${TEST_FILE}" diff --git a/.github/scripts/run_npu_test_shard.py b/.github/scripts/run_npu_test_shard.py index 15c591cd52..5e6f2dfa8b 100644 --- a/.github/scripts/run_npu_test_shard.py +++ b/.github/scripts/run_npu_test_shard.py @@ -795,6 +795,9 @@ def _execute_worker_batch( timeout, verbose, shard, shard_type, npu_device_id, ) + unexpected_stdout_dir = report_dir / "unexpected_stdout" + unexpected_stdout_dir.mkdir(parents=True, exist_ok=True) + while remaining_cases: batch_input["cases"] = [ { @@ -828,19 +831,49 @@ def _execute_worker_batch( last_output_time = monotonic() + unexpected_log_path = unexpected_stdout_dir / f"batch_{batch_id}.log" + unexpected_count = 0 + unexpected_lock = threading.Lock() + def _read_stdout(): - nonlocal last_output_time + nonlocal last_output_time, unexpected_count if proc.stdout: for line in proc.stdout: last_output_time = monotonic() - line = line.strip() - if not line: + raw_line = line.strip() + if not raw_line: continue try: - case_result = json.loads(line) + case_result = json.loads(raw_line) except json.JSONDecodeError: continue + if not isinstance(case_result, dict): + with unexpected_lock: + unexpected_count += 1 + count = unexpected_count + ts = datetime.now().isoformat() + json_type = type(case_result).__name__ + line_preview = raw_line[:10000] + try: + with open(unexpected_log_path, "a", encoding="utf-8") as uf: + uf.write( + f"[{ts}] #{count} type={json_type}" + f" len={len(raw_line)}\n" + f"{line_preview}\n" + f"{'-' * 
80}\n" + ) + except OSError: + pass + if count == 1: + print( + f" [Batch {batch_id}] Unexpected non-dict JSON line" + f" (type={json_type}, len={len(raw_line)})," + f" full details saved to {unexpected_log_path}", + flush=True, + ) + continue + nodeid = case_result.get("nodeid", "") status = case_result.get("status", "error") duration = case_result.get("duration", 0.0) diff --git a/.github/workflows/_torch-npu-upstream-collect.yml b/.github/workflows/_torch-npu-upstream-collect.yml index 8efe91a424..8225a4f7e0 100644 --- a/.github/workflows/_torch-npu-upstream-collect.yml +++ b/.github/workflows/_torch-npu-upstream-collect.yml @@ -67,7 +67,7 @@ jobs: steps: - name: Setup NPU test environment - uses: Ascend/pytorch/.github/actions/setup-npu-test-env@v2.7.1 + uses: kerer-ai/pytorch/.github/actions/setup-npu-test-env@v2.7.1_image with: python_version: ${{ inputs.python_version }} torch_npu_wheel_artifact: ${{ inputs.torch_npu_wheel_artifact }} @@ -122,8 +122,8 @@ jobs: echo "total_cases=${TOTAL_CASES}" >> $GITHUB_OUTPUT echo "=== Shard configuration ===" - echo "Distributed tests: ${DISTRIBUTED_SHARDS} shards (case-level, serial execution, linux-aarch64-a3-16)" - echo "Regular tests: ${REGULAR_SHARDS} shards (case-level, 64 workers, linux-aarch64-a3-16)" + echo "Distributed tests: ${DISTRIBUTED_SHARDS} shards (case-level, serial execution, linux-aarch64-a3-8)" + echo "Regular tests: ${REGULAR_SHARDS} shards (case-level, 16 workers, linux-aarch64-a3-16)" echo "Total cases: ${TOTAL_CASES}" # Package error logs if any (place at workspace root for flat artifact layout) diff --git a/.github/workflows/_torch-npu-upstream-report.yml b/.github/workflows/_torch-npu-upstream-report.yml index 23bf4f62b8..ffa95209af 100644 --- a/.github/workflows/_torch-npu-upstream-report.yml +++ b/.github/workflows/_torch-npu-upstream-report.yml @@ -112,7 +112,7 @@ jobs: --patch-count "${{ inputs.patch_count }}" \ --shard-matrix-json "${COMBINED_MATRIX}" \ --docker-image "${{ 
inputs.docker_image }}" \ - --runner "linux-aarch64-a3-16 (distributed, serial), linux-aarch64-a3-16 (regular, 64 workers), linux-aarch64-a3-8 (custom)" \ + --runner "linux-aarch64-a3-8 (distributed, serial), linux-aarch64-a3-16 (regular, 16 workers), linux-aarch64-a3-16 (custom)" \ --cases-summary cases-shards/cases_collection_summary.json \ --cases-by-file-dir cases-shards diff --git a/.github/workflows/_torch-npu-upstream-test-custom.yml b/.github/workflows/_torch-npu-upstream-test-custom.yml index 3aa1ba2294..0f637f3d45 100644 --- a/.github/workflows/_torch-npu-upstream-test-custom.yml +++ b/.github/workflows/_torch-npu-upstream-test-custom.yml @@ -40,7 +40,7 @@ jobs: steps: - name: Setup NPU test environment - uses: Ascend/pytorch/.github/actions/setup-npu-test-env@v2.7.1 + uses: kerer-ai/pytorch/.github/actions/setup-npu-test-env@v2.7.1_image with: python_version: ${{ inputs.python_version }} torch_npu_wheel_artifact: ${{ inputs.torch_npu_wheel_artifact }} @@ -49,6 +49,8 @@ jobs: - name: Run custom test files id: run_tests + env: + CI: '' run: | source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true diff --git a/.github/workflows/_torch-npu-upstream-test-dist.yml b/.github/workflows/_torch-npu-upstream-test-dist.yml index 0c47d1095b..99518ea80c 100644 --- a/.github/workflows/_torch-npu-upstream-test-dist.yml +++ b/.github/workflows/_torch-npu-upstream-test-dist.yml @@ -35,7 +35,7 @@ defaults: jobs: run_tests: name: test_distributed (${{ matrix.shard }}/${{ inputs.distributed_shards }}) - runs-on: linux-aarch64-a3-16 + runs-on: linux-aarch64-a3-8 timeout-minutes: 1800 container: image: ${{ inputs.docker_image }} @@ -48,7 +48,7 @@ jobs: steps: - name: Setup NPU test environment - uses: Ascend/pytorch/.github/actions/setup-npu-test-env@v2.7.1 + uses: kerer-ai/pytorch/.github/actions/setup-npu-test-env@v2.7.1_image with: python_version: ${{ inputs.python_version }} torch_npu_wheel_artifact: 
${{ inputs.torch_npu_wheel_artifact }} @@ -63,6 +63,8 @@ jobs: - name: Run distributed shard ${{ matrix.shard }}/${{ inputs.distributed_shards }} id: run_test + env: + CI: '' run: | source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true @@ -78,7 +80,7 @@ jobs: echo "=== Distributed Shard ${{ matrix.shard }} (Case-level) ===" echo "Total cases: ${TOTAL_CASES}" - echo "Runner: linux-aarch64-a3-16 (16-card NPU)" + echo "Runner: linux-aarch64-a3-8" echo "Execution mode: SERIAL" # Distributed tests: pre-collected cases, serial execution @@ -147,4 +149,25 @@ jobs: with: name: error-logs-dist-${{ matrix.shard }} path: error-logs-dist-${{ matrix.shard }}.tar.gz + retention-days: 60 + + - name: Compress /root/ascend CANN logs + if: always() + run: | + if [ -d "/root/ascend" ]; then + echo "=== Compressing /root/ascend CANN logs ===" + FILE_COUNT=$(find /root/ascend -type f | wc -l) + echo "Found ${FILE_COUNT} files under /root/ascend" + tar -czf ascend-cann-logs-dist-${{ matrix.shard }}.tar.gz -C /root ascend + echo "CANN logs compressed: $(ls -lh ascend-cann-logs-dist-${{ matrix.shard }}.tar.gz)" + else + echo "/root/ascend directory does not exist, skipping" + fi + + - name: Upload CANN logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: ascend-cann-logs-dist-${{ matrix.shard }} + path: ascend-cann-logs-dist-${{ matrix.shard }}.tar.gz retention-days: 60 \ No newline at end of file diff --git a/.github/workflows/_torch-npu-upstream-test-regular.yml b/.github/workflows/_torch-npu-upstream-test-regular.yml index a0859bab3a..d988fce1e6 100644 --- a/.github/workflows/_torch-npu-upstream-test-regular.yml +++ b/.github/workflows/_torch-npu-upstream-test-regular.yml @@ -48,7 +48,7 @@ jobs: steps: - name: Setup NPU test environment - uses: Ascend/pytorch/.github/actions/setup-npu-test-env@v2.7.1 + uses: kerer-ai/pytorch/.github/actions/setup-npu-test-env@v2.7.1_image with: python_version: ${{ 
inputs.python_version }} torch_npu_wheel_artifact: ${{ inputs.torch_npu_wheel_artifact }} @@ -61,8 +61,18 @@ jobs: name: cases-shards path: cases-shards + - name: Debug all environment variables + run: | + echo "=== All Environment Variables (secrets filtered) ===" + env | sort | grep -ivE \ + 'PASSWORD|PASSWD|SECRET|TOKEN|KEY|CREDENTIAL|PRIVATE|ACCESS|SIGNING|AUTH|CERT|ENC(ODE|RYPT)|SALT|NONCE|ACCOUNT|IDENTITY|LICENSE' \ + || true + echo "=== End ===" + - name: Run regular shard ${{ matrix.shard }}/${{ inputs.regular_shards }} id: run_test + env: + CI: '' run: | source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true @@ -89,7 +99,7 @@ jobs: --disabled-testcases pytorch-test-src/test_upstream/disabled_testcases.json \ --report-dir ${REPORT_DIR} \ --timeout 1200 \ - --max-workers 64 \ + --max-workers 16 \ --verbose \ 2>&1 | tee /tmp/test_shard_reg_${{ matrix.shard }}.log @@ -150,4 +160,25 @@ jobs: with: name: error-logs-reg-${{ matrix.shard }} path: error-logs-reg-${{ matrix.shard }}.tar.gz + retention-days: 60 + + - name: Compress /root/ascend CANN logs + if: always() + run: | + if [ -d "/root/ascend" ]; then + echo "=== Compressing /root/ascend CANN logs ===" + FILE_COUNT=$(find /root/ascend -type f | wc -l) + echo "Found ${FILE_COUNT} files under /root/ascend" + tar -czf ascend-cann-logs-reg-${{ matrix.shard }}.tar.gz -C /root ascend + echo "CANN logs compressed: $(ls -lh ascend-cann-logs-reg-${{ matrix.shard }}.tar.gz)" + else + echo "/root/ascend directory does not exist, skipping" + fi + + - name: Upload CANN logs + if: always() + uses: actions/upload-artifact@v4 + with: + name: ascend-cann-logs-reg-${{ matrix.shard }} + path: ascend-cann-logs-reg-${{ matrix.shard }}.tar.gz retention-days: 60 \ No newline at end of file diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml new file mode 100644 index 0000000000..5d4199a2ff --- /dev/null +++ 
b/.github/workflows/build-docker-images.yml
@@ -0,0 +1,144 @@
+name: Build Docker Images
+
+# Builds the torch-npu CI images under .ci/docker and pushes them to Quay.io.
+# Runs on manual dispatch (optionally for one tag) and on pushes that touch the
+# image sources or this workflow file.
+on:
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: 'Single image tag to build (without timestamp). Leave empty to build all.'
+        required: false
+        type: string
+        default: ''
+  push:
+    paths:
+      - .ci/docker/**
+      - .github/workflows/build-docker-images.yml
+
+env:
+  REGISTRY: quay.io
+  QUAY_ORG: kerer
+  IMAGE_NAME: pytorch
+
+jobs:
+  # Emits the JSON list of image tags that the build job fans out over.
+  matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      tags: ${{ steps.set.outputs.tags }}
+    steps:
+      - id: set
+        env:
+          # Hand the user-supplied tag over via the environment; expanding the
+          # input expression inside the script would allow shell injection.
+          INPUT_TAG: ${{ inputs.tag }}
+        run: |
+          if [ -n "${INPUT_TAG}" ]; then
+            # Only characters valid in a Docker tag are accepted, which also
+            # keeps the hand-built JSON array below well-formed.
+            if [[ ! "${INPUT_TAG}" =~ ^[A-Za-z0-9_][A-Za-z0-9_.-]*$ ]]; then
+              echo "::error::Invalid image tag supplied to workflow_dispatch"
+              exit 1
+            fi
+            TAGS="[\"${INPUT_TAG}\"]"
+          else
+            TAGS='["torch-npu-builder-x86_64-torch2.7.1","torch-npu-builder-aarch64-torch2.7.1","torch-npu-test-x86_64-cann-a1-py3.10-torch2.7.1","torch-npu-test-x86_64-cann-a2-py3.10-torch2.7.1","torch-npu-test-x86_64-cann-a3-py3.10-torch2.7.1","torch-npu-test-aarch64-cann-a1-py3.10-torch2.7.1","torch-npu-test-aarch64-cann-a2-py3.10-torch2.7.1","torch-npu-test-aarch64-cann-a3-py3.10-torch2.7.1"]'
+          fi
+          echo "tags=${TAGS}" >> "$GITHUB_OUTPUT"
+
+  # Builds one image per tag on a runner of the matching architecture and
+  # pushes it to the registry with a UTC timestamp suffix.
+  build:
+    needs: matrix
+    environment: QUAY_USERNAME
+    permissions:
+      contents: read
+    strategy:
+      fail-fast: false
+      matrix:
+        tag: ${{ fromJSON(needs.matrix.outputs.tags) }}
+    runs-on: ${{ contains(matrix.tag, 'x86_64') && 'ubuntu-latest' || 'ubuntu-22.04-arm' }}
+    steps:
+      - name: Free up disk space
+        run: |
+          sudo rm -rf /usr/local/lib/android /opt/ghc /usr/local/share/boost
+          sudo rm -rf /usr/share/dotnet /usr/local/share/powershell
+          sudo rm -rf /opt/hostedtoolcache
+          docker system prune -af
+          sudo apt-get clean && sudo apt-get autoremove -y
+          df -h
+
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Log in to Quay.io
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ secrets.QUAY_USERNAME }}
+          password: ${{ secrets.QUAY_PASSWORD }}
+
+      - name: Build and push image
+        env:
+          # The tag may originate from workflow_dispatch input, so it is read
+          # from the environment instead of being expanded into the script.
+          MATRIX_TAG: ${{ matrix.tag }}
+        run: |
+          TIMESTAMP=$(date -u +%Y%m%d%H%M)
+          cd .ci/docker
+          TIMESTAMP="${TIMESTAMP}" ./docker_build.sh "${MATRIX_TAG}"
+
+          IMAGE_TAG="${MATRIX_TAG}-${TIMESTAMP}"
+          REMOTE_IMAGE="${REGISTRY}/${QUAY_ORG}/${IMAGE_NAME}:${IMAGE_TAG}"
+          docker tag "${IMAGE_TAG}" "${REMOTE_IMAGE}"
+          docker push "${REMOTE_IMAGE}"
+
+          mkdir -p /tmp/result
+          echo "${REMOTE_IMAGE}" > "/tmp/result/${MATRIX_TAG}.txt"
+          echo "Pushed ${REMOTE_IMAGE}"
+
+      - name: Upload result
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: result-${{ matrix.tag }}
+          path: /tmp/result/${{ matrix.tag }}.txt
+          retention-days: 1
+
+  # Gathers the per-image result files and renders them as a table in the
+  # run summary; runs even when some builds failed.
+  summary:
+    needs: [matrix, build]
+    runs-on: ubuntu-latest
+    if: always()
+    steps:
+      - name: Download results
+        uses: actions/download-artifact@v4
+        with:
+          pattern: result-*
+          path: /tmp/results
+          merge-multiple: true
+
+      - name: Generate summary
+        run: |
+          {
+            echo "## Docker Image Build Summary"
+            echo ""
+            echo "| # | Image | Pull Command |"
+            echo "|---|-------|-------------|"
+
+            if [ -d /tmp/results ] && [ "$(ls -A /tmp/results 2>/dev/null)" ]; then
+              COUNT=1
+              for f in /tmp/results/*.txt; do
+                IMAGE=$(cat "$f")
+                echo "| ${COUNT} | \`${IMAGE##*:}\` | \`docker pull ${IMAGE}\` |"
+                COUNT=$((COUNT + 1))
+              done
+            else
+              echo "| - | No images built | - |"
+            fi
+
+            echo ""
+            echo "**Registry:** \`${REGISTRY}/${QUAY_ORG}/${IMAGE_NAME}\`"
+          } >> "$GITHUB_STEP_SUMMARY"
diff --git a/test_upstream/torch_env_patch.sh b/test_upstream/torch_env_patch.sh
index 84d879c1c1..cba99307a3 100755
--- a/test_upstream/torch_env_patch.sh
+++ b/test_upstream/torch_env_patch.sh
@@ -102,4 +102,4 @@ done
 echo ""
 echo "========================================"
 echo "All $count patches applied successfully"
-echo "========================================"
\ No newline at end of file
+echo "========================================"