From 559941d5814191d61922f22be4e3e4372ca3014c Mon Sep 17 00:00:00 2001
From: aflying <1550630265@qq.com>
Date: Thu, 2 Apr 2026 20:38:24 +0800
Subject: [PATCH 1/4] Change runner for nightly ARM build to linux-aarch64

Change runner for nightly ARM build to linux-aarch64
---
 .github/workflows/nightly-build-arm.yml | 217 ++++++++++++++++++++++++
 1 file changed, 217 insertions(+)
 create mode 100644 .github/workflows/nightly-build-arm.yml

diff --git a/.github/workflows/nightly-build-arm.yml b/.github/workflows/nightly-build-arm.yml
new file mode 100644
index 0000000000..1a2dffd2fc
--- /dev/null
+++ b/.github/workflows/nightly-build-arm.yml
@@ -0,0 +1,217 @@
+name: Ascend/pytorch Nightly Build Validation (ARM)
+
+on:
+  push:
+    paths:
+      - '.github/workflows/nightly-build-arm.yml'
+  schedule:
+    - cron: '0 22 * * *'  # 22:00 UTC (06:00 the next day, Beijing time)
+    - cron: '0 3 * * *'  # 03:00 UTC (11:00 Beijing time)
+    - cron: '0 8 * * *'  # 08:00 UTC (16:00 Beijing time)
+  workflow_dispatch:
+    inputs:
+      torch_nightly_date:
+        description: 'PyTorch nightly 日期 (格式: YYYYMMDD,留空使用最新版)'
+        required: false
+        default: ''
+
+jobs:
+  build:
+    name: Build torch_npu (ARM, PyTorch nightly)
+    runs-on: linux-aarch64-a3-2
+    container:
+      image: swr.cn-north-4.myhuaweicloud.com/frameworkptadapter/pytorch_2.11.0_a2_aarch64_builder:20260331
+      options: --user root
+    defaults:
+      run:
+        # Container jobs default to `sh`; these scripts rely on bash features
+        # (PIPESTATUS, [[ ]], &>), so select bash for every run step.
+        shell: bash
+    env:
+      PYTHON_VERSION: '3.11'
+      DOCKER_IMAGE: swr.cn-north-4.myhuaweicloud.com/frameworkptadapter/pytorch_2.11.0_a2_aarch64_builder:20260331
+      # Skip auditwheel repair so that missing CANN libraries (for example
+      # libccl_dpu.so) do not fail the build. torch_npu depends on an external
+      # CANN environment, so the wheel does not need to bundle those libraries.
+      AUDITWHEEL_PLAT: 'skip'
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Setup cache directories
+        run: |
+          mkdir -p ~/.cache/pip
+          mkdir -p ~/.cache/ccache
+          chmod -R 777 ~/.cache
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pip
+          key: pip-arm-py${{ env.PYTHON_VERSION }}-torch-nightly
+          restore-keys: |
+            pip-arm-py${{ env.PYTHON_VERSION }}-
+
+      - name: Cache ccache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ccache
+          key: ccache-arm-py${{ env.PYTHON_VERSION }}-${{ github.run_id }}
+          restore-keys: |
+            ccache-arm-py${{ env.PYTHON_VERSION }}-
+
+      - name: Upgrade PyTorch to nightly (CPU, aarch64)
+        id: install_torch
+        env:
+          # Passed through the environment instead of being interpolated into
+          # the script text, so the input cannot inject shell syntax.
+          TORCH_NIGHTLY_DATE: ${{ github.event.inputs.torch_nightly_date }}
+        run: |
+          # Use the image's Python interpreter for PYTHON_VERSION.
+          PYTHON=python${{ env.PYTHON_VERSION }}
+          PIP=pip${{ env.PYTHON_VERSION }}
+
+          # Make pip use the cached directory.
+          export PIP_CACHE_DIR=~/.cache/pip
+
+          $PIP install --upgrade pip
+
+          # Remove the torch/torchvision preinstalled in the image to avoid
+          # version conflicts.
+          $PIP uninstall -y torch torchvision || true
+
+          if [ -n "${TORCH_NIGHTLY_DATE}" ]; then
+            if ! [[ "${TORCH_NIGHTLY_DATE}" =~ ^[0-9]{8}$ ]]; then
+              echo "::error::torch_nightly_date must be YYYYMMDD, got '${TORCH_NIGHTLY_DATE}'"
+              exit 1
+            fi
+            $PIP install --pre \
+              "torch==2.12.0.dev${TORCH_NIGHTLY_DATE}" \
+              --index-url https://download.pytorch.org/whl/nightly/cpu
+          else
+            $PIP install --pre torch \
+              --index-url https://download.pytorch.org/whl/nightly/cpu
+          fi
+          TORCH_VER=$($PYTHON -c "import torch; print(torch.__version__)")
+          echo "version=${TORCH_VER}" >> "$GITHUB_OUTPUT"
+          echo "PyTorch nightly version: ${TORCH_VER}"
+
+      - name: Clone Ascend/pytorch (with submodules)
+        id: clone_repo
+        run: |
+          git clone --depth=1 --recurse-submodules \
+            https://gitcode.com/Ascend/pytorch.git ascend_pytorch
+          cd ascend_pytorch
+          COMMIT=$(git rev-parse HEAD)
+          COMMIT_SHORT=$(git rev-parse --short HEAD)
+          COMMIT_DATE=$(git log -1 --format='%ci')
+          echo "commit=${COMMIT}" >> "$GITHUB_OUTPUT"
+          echo "commit_short=${COMMIT_SHORT}" >> "$GITHUB_OUTPUT"
+          echo "commit_date=${COMMIT_DATE}" >> "$GITHUB_OUTPUT"
+          echo "Ascend/pytorch commit: ${COMMIT} (${COMMIT_DATE})"
+
+      - name: Install Python build dependencies
+        run: |
+          PIP=pip${{ env.PYTHON_VERSION }}
+          cd ascend_pytorch
+          # requirements.txt pins a torch version; skip it and keep the nightly
+          # installed above. auditwheel is skipped via AUDITWHEEL_PLAT=skip,
+          # so only setuptools is installed.
+          $PIP install setuptools
+
+      - name: Build torch_npu wheel
+        id: build
+        run: |
+          PYTHON=python${{ env.PYTHON_VERSION }}
+          cd ascend_pytorch
+
+          # Enable ccache when it is available.
+          if command -v ccache &> /dev/null; then
+            echo "ccache found, enabling ccache"
+            # Export the cache settings first so the ccache commands below
+            # operate on the directory restored by actions/cache.
+            export CCACHE_DIR=~/.cache/ccache
+            export CCACHE_COMPRESS=1
+            export CCACHE_MAXSIZE=10G
+            export CCACHE_BASEDIR="${PWD}"
+            export CC="ccache gcc"
+            export CXX="ccache g++"
+            ccache -M 10G
+            ccache -z || true
+            USE_CCACHE=1
+          else
+            echo "ccache not found, building without cache"
+            USE_CCACHE=0
+          fi
+
+          # Match build parallelism to the runner's CPU core count.
+          export MAX_JOBS=$(nproc)
+
+          # Build torchair by default; RPC keeps its default build behaviour.
+          export DISABLE_INSTALL_TORCHAIR=FALSE
+          export BUILD_WITHOUT_SHA=1
+
+          # bash steps run with errexit and pipefail; suspend errexit so a
+          # failed build still reaches the status and ccache reporting below.
+          set +e
+          $PYTHON setup.py build bdist_wheel 2>&1 | tee /tmp/build.log
+          BUILD_STATUS=${PIPESTATUS[0]}
+          set -e
+
+          # Report ccache statistics.
+          if [ "${USE_CCACHE}" = "1" ]; then
+            CCACHE_STATS=$(ccache -s | grep -E "^(Hits|Misses|Cache size)" | tr '\n' ' ' || true)
+            echo "hit_rate=${CCACHE_STATS}" >> "$GITHUB_OUTPUT"
+            ccache -s || true
+          fi
+
+          echo "status=${BUILD_STATUS}" >> "$GITHUB_OUTPUT"
+          if [ "${BUILD_STATUS}" -eq 0 ]; then
+            WHL=$(ls dist/*.whl 2>/dev/null | head -1 || true)
+            echo "wheel=${WHL}" >> "$GITHUB_OUTPUT"
+            echo "Build succeeded: ${WHL}"
+          fi
+          exit "${BUILD_STATUS}"
+
+      - name: Upload build log
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: build-log-arm-${{ github.run_number }}
+          path: /tmp/build.log
+          if-no-files-found: warn
+
+      - name: Upload wheel
+        if: steps.build.outputs.status == '0'
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch_npu-wheel-arm-${{ github.run_number }}
+          path: ascend_pytorch/dist/*.whl
+          if-no-files-found: warn
+
+      - name: Build summary
+        if: always()
+        run: |
+          BUILD_STATUS="${{ steps.build.outputs.status }}"
+          if [ "${BUILD_STATUS}" = "0" ]; then
+            STATUS_ICON="✅ SUCCESS"
+          else
+            STATUS_ICON="❌ FAILED"
+          fi
+
+          cat >> "$GITHUB_STEP_SUMMARY" << EOF
+          ## Ascend/pytorch Nightly Build Validation (ARM)
+
+          | 项目 | 详情 |
+          |------|------|
+          | 构建时间 | $(date -u '+%Y-%m-%d %H:%M UTC') |
+          | Docker 镜像 | \`${{ env.DOCKER_IMAGE }}\` |
+          | PyTorch Nightly | \`${{ steps.install_torch.outputs.version }}\` |
+          | Ascend/pytorch Commit | [\`${{ steps.clone_repo.outputs.commit_short }}\`](https://gitcode.com/Ascend/pytorch/commit/${{ steps.clone_repo.outputs.commit }}) |
+          | Commit 时间 | ${{ steps.clone_repo.outputs.commit_date }} |
+          | ccache 统计 | ${{ steps.build.outputs.hit_rate || 'N/A' }} |
+          | 构建结果 | ${STATUS_ICON} |
+
+          $( [ "${BUILD_STATUS}" = "0" ] && echo "> Wheel: \`${{ steps.build.outputs.wheel }}\`" || echo "> 查看 build-log artifact 获取详细错误信息" )
+          EOF

From c7f75944d74b24636fe3149908bd0b4a71219ea7 Mon Sep 17 00:00:00 2001
From: aflying <1550630265@qq.com>
Date: Thu, 2 Apr 2026 20:43:20 +0800
Subject: [PATCH 2/4] Change runner for nightly ARM build to linux-aarch64

Change runner for nightly ARM build to linux-aarch64
---
 .github/workflows/nightly-build-arm.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/nightly-build-arm.yml b/.github/workflows/nightly-build-arm.yml
index 1a2dffd2fc..b2ed4f22e5 100644
--- a/.github/workflows/nightly-build-arm.yml
+++ b/.github/workflows/nightly-build-arm.yml
@@ -1,10 +1,9 @@
 name: Ascend/pytorch Nightly Build Validation (ARM)
 
 on:
-  push:
-    paths:
-      - '.github/workflows/nightly-build-arm.yml'
+  pull_request:  # run on pull request events
+    branches: master
   schedule:
     - cron: '0 22 * * *'  # 22:00 UTC (06:00 the next day, Beijing time)
     - cron: '0 3 * * *'  # 03:00 UTC (11:00 Beijing time)
     - cron: '0 8 * * *'  # 08:00 UTC (16:00 Beijing time)

From af88c9bf70a469008b90deb7bf75e3451d89ba71 Mon Sep 17 00:00:00 2001
From: aflying <1550630265@qq.com>
Date: Thu, 2 Apr 2026 20:43:59 +0800
Subject: [PATCH 3/4] Update branches syntax in nightly build workflow

Update branches syntax in nightly build workflow
---
 .github/workflows/nightly-build-arm.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/nightly-build-arm.yml b/.github/workflows/nightly-build-arm.yml
index b2ed4f22e5..ef66a956d5 100644
--- a/.github/workflows/nightly-build-arm.yml
+++ b/.github/workflows/nightly-build-arm.yml
@@ -2,7 +2,7 @@ name: Ascend/pytorch Nightly Build Validation (ARM)
 
 on:
   pull_request:  # run on pull request events
-    branches: master
+    branches: [ master ]
   schedule:
     - cron: '0 22 * * *'  # 22:00 UTC (06:00 the next day, Beijing time)
     - cron: '0 3 * * *'  # 03:00 UTC (11:00 Beijing time)

From 9bb7b9056ab8eba879dcaf0f8872da447cb06cec Mon Sep 17 00:00:00 2001
From: wupengfei <1550630265@qq.com>
Date: Thu, 2 Apr 2026 21:30:55 +0800
Subject: [PATCH 4/4] Add nightly CI workflow for torch_npu build and test

This workflow builds and tests the torch_npu project, including caching, environment setup, and logging.
---
 .github/workflows/nightly-ci.yml | 277 +++++++++++++++++++++++++++++++
 1 file changed, 277 insertions(+)
 create mode 100644 .github/workflows/nightly-ci.yml

diff --git a/.github/workflows/nightly-ci.yml b/.github/workflows/nightly-ci.yml
new file mode 100644
index 0000000000..25a81bb0c1
--- /dev/null
+++ b/.github/workflows/nightly-ci.yml
@@ -0,0 +1,277 @@
+name: NPU Build and Test
+
+on:
+  push:
+    paths:
+      - '.github/workflows/nightly-ci.yml'
+  schedule:
+    - cron: '0 23 * * *'  # 23:00 UTC (07:00 the next day, Beijing time)
+  workflow_dispatch:
+
+jobs:
+  build-and-test:
+    name: Build and Test torch_npu
+    runs-on: [self-hosted, npu-910b]
+    container:
+      image: swr.cn-north-4.myhuaweicloud.com/frameworkptadapter/pytorch_2.11.0_a2_aarch64_builder:20260331
+      # Pass through the Ascend device nodes and bind-mount npu-smi plus the
+      # host's driver, firmware, CANN and NNAL directories.
+      options: >-
+        --user root
+        --device /dev/davinci4
+        --device /dev/davinci_manager
+        --device /dev/devmm_svm
+        --device /dev/hisi_hdc
+        -v /usr/local/sbin/npu-smi:/usr/local/sbin/npu-smi
+        -v /usr/local/Ascend/driver:/usr/local/Ascend/driver
+        -v /usr/local/Ascend/firmware:/usr/local/Ascend/firmware
+        -v /usr/local/Ascend/cann:/usr/local/Ascend/cann
+        -v /usr/local/Ascend/nnal:/usr/local/Ascend/nnal
+    defaults:
+      run:
+        # Container jobs default to `sh`; these scripts rely on bash features
+        # (PIPESTATUS, &>), so select bash for every run step.
+        shell: bash
+    env:
+      PYTHON_VERSION: '3.11'
+      DOCKER_IMAGE: swr.cn-north-4.myhuaweicloud.com/frameworkptadapter/pytorch_2.11.0_a2_aarch64_builder:20260331
+      AUDITWHEEL_PLAT: 'skip'
+
+    steps:
+      - name: Checkout pytorch-infra (via proxy)
+        run: |
+          rm -rf * .[!.]* 2>/dev/null || true
+          git clone --depth=1 https://gh-proxy.test.osinfra.cn/https://github.com/computing-infra/pytorch-infra.git .
+          echo "Repository cloned successfully"
+
+      - name: Setup cache directories
+        run: |
+          mkdir -p ~/.cache/pip
+          mkdir -p ~/.cache/ccache
+          chmod -R 777 ~/.cache
+
+      - name: Cache pip
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/pip
+          key: pip-arm-py${{ env.PYTHON_VERSION }}-build-test
+          restore-keys: |
+            pip-arm-py${{ env.PYTHON_VERSION }}-
+
+      - name: Cache ccache
+        uses: actions/cache@v4
+        with:
+          path: ~/.cache/ccache
+          key: ccache-arm-py${{ env.PYTHON_VERSION }}-${{ github.run_id }}
+          restore-keys: |
+            ccache-arm-py${{ env.PYTHON_VERSION }}-
+
+      - name: Uninstall pre-installed torch/torchvision
+        run: |
+          pip${{ env.PYTHON_VERSION }} uninstall -y torch torchvision || true
+          echo "Pre-installed torch/torchvision uninstalled"
+
+      - name: Install PyTorch nightly
+        id: install_torch
+        run: |
+          PIP=pip${{ env.PYTHON_VERSION }}
+          PYTHON=python${{ env.PYTHON_VERSION }}
+
+          export PIP_CACHE_DIR=~/.cache/pip
+          $PIP install --upgrade pip
+
+          # Install the pinned PyTorch nightly 2.12.0.dev20260217 (CPU, aarch64).
+          $PIP install --pre "torch==2.12.0.dev20260217" --index-url https://download.pytorch.org/whl/nightly/cpu
+
+          TORCH_VER=$($PYTHON -c "import torch; print(torch.__version__)")
+          echo "version=${TORCH_VER}" >> "$GITHUB_OUTPUT"
+          echo "PyTorch nightly version: ${TORCH_VER}"
+
+      - name: Clone Ascend/pytorch
+        id: clone_repo
+        run: |
+          git clone --depth=1 --recurse-submodules \
+            https://gitcode.com/Ascend/pytorch.git ascend_pytorch
+          cd ascend_pytorch
+          COMMIT=$(git rev-parse HEAD)
+          COMMIT_SHORT=$(git rev-parse --short HEAD)
+          COMMIT_DATE=$(git log -1 --format='%ci')
+          echo "commit=${COMMIT}" >> "$GITHUB_OUTPUT"
+          echo "commit_short=${COMMIT_SHORT}" >> "$GITHUB_OUTPUT"
+          echo "commit_date=${COMMIT_DATE}" >> "$GITHUB_OUTPUT"
+          echo "Ascend/pytorch commit: ${COMMIT} (${COMMIT_DATE})"
+
+      - name: Build torch_npu wheel
+        id: build
+        run: |
+          PYTHON=python${{ env.PYTHON_VERSION }}
+          cd ascend_pytorch
+
+          # Enable ccache when it is available.
+          if command -v ccache &> /dev/null; then
+            echo "ccache found, enabling ccache"
+            # Export the cache settings first so the ccache commands below
+            # operate on the directory restored by actions/cache.
+            export CCACHE_DIR=~/.cache/ccache
+            export CCACHE_COMPRESS=1
+            export CCACHE_MAXSIZE=10G
+            export CCACHE_BASEDIR="${PWD}"
+            export CC="ccache gcc"
+            export CXX="ccache g++"
+            ccache -M 10G
+            ccache -z || true
+            USE_CCACHE=1
+          else
+            echo "ccache not found, building without cache"
+            USE_CCACHE=0
+          fi
+
+          # Build parameters.
+          export MAX_JOBS=$(nproc)
+          export DISABLE_INSTALL_TORCHAIR=FALSE
+          export BUILD_WITHOUT_SHA=1
+
+          # Build through ci/build.sh. errexit is suspended so a failed build
+          # still reaches the status and ccache reporting below.
+          set +e
+          bash ci/build.sh --python=${{ env.PYTHON_VERSION }} 2>&1 | tee /tmp/build.log
+          BUILD_STATUS=${PIPESTATUS[0]}
+          set -e
+
+          # Report ccache statistics.
+          if [ "${USE_CCACHE}" = "1" ]; then
+            CCACHE_STATS=$(ccache -s | grep -E "^(Hits|Misses|Cache size)" | tr '\n' ' ' || true)
+            echo "ccache_stats=${CCACHE_STATS}" >> "$GITHUB_OUTPUT"
+            ccache -s || true
+          fi
+
+          echo "status=${BUILD_STATUS}" >> "$GITHUB_OUTPUT"
+          if [ "${BUILD_STATUS}" -eq 0 ]; then
+            WHL=$(ls dist/*.whl 2>/dev/null | head -1 || true)
+            echo "wheel=${WHL}" >> "$GITHUB_OUTPUT"
+            echo "Build succeeded: ${WHL}"
+          fi
+          exit "${BUILD_STATUS}"
+
+      - name: Install torch_npu wheel
+        run: |
+          pip${{ env.PYTHON_VERSION }} install ascend_pytorch/dist/torch_npu*.whl
+          echo "torch_npu wheel installed"
+
+      - name: Check Ascend paths
+        run: |
+          echo "=== Checking Ascend paths ==="
+          ls -la /usr/local/Ascend/ 2>&1 || echo "/usr/local/Ascend not found"
+          ls -la /usr/local/Ascend/cann/ 2>&1 || echo "/usr/local/Ascend/cann not found"
+          ls -la /usr/local/Ascend/nnal/ 2>&1 || echo "/usr/local/Ascend/nnal not found"
+
+      - name: Verify NPU availability
+        run: |
+          # Load the CANN environment variables.
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          # Set the key environment variables explicitly.
+          export CANN_PATH=/usr/local/Ascend/cann
+          export LD_LIBRARY_PATH=$CANN_PATH/lib64:$CANN_PATH/lib64/plugin/opskernel:$CANN_PATH/lib64/plugin/nnengine:$CANN_PATH/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:$CANN_PATH/tools/aml/lib64:$CANN_PATH/tools/aml/lib64/plugin:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/nnal/atb/latest/atb/cxx_abi_1/lib:$LD_LIBRARY_PATH
+          export ASCEND_HOME_PATH=$CANN_PATH
+          export ASCEND_OPP_PATH=$CANN_PATH/opp
+
+          PYTHON=python${{ env.PYTHON_VERSION }}
+          echo "=== Testing torch_npu import ==="
+          $PYTHON -c "import torch; print(f'torch: {torch.__version__}'); import torch_npu; print(f'torch_npu: {torch_npu.__version__}'); print(f'NPU available: {torch.npu.is_available()}'); print(f'NPU count: {torch.npu.device_count()}'); print(f'NPU name: {torch.npu.get_device_name(0) if torch.npu.is_available() else \"N/A\"}')"
+
+      - name: Run test_device.py
+        id: run_tests
+        run: |
+          # Load the CANN environment variables.
+          source /usr/local/Ascend/cann/set_env.sh 2>/dev/null || true
+          source /usr/local/Ascend/nnal/atb/set_env.sh 2>/dev/null || true
+
+          # Set the key environment variables explicitly.
+          export CANN_PATH=/usr/local/Ascend/cann
+          export LD_LIBRARY_PATH=$CANN_PATH/lib64:$CANN_PATH/lib64/plugin/opskernel:$CANN_PATH/lib64/plugin/nnengine:$CANN_PATH/opp/built-in/op_impl/ai_core/tbe/op_tiling/lib/linux/aarch64:$CANN_PATH/tools/aml/lib64:$CANN_PATH/tools/aml/lib64/plugin:/usr/local/Ascend/driver/lib64:/usr/local/Ascend/driver/lib64/common:/usr/local/Ascend/driver/lib64/driver:/usr/local/Ascend/nnal/atb/latest/atb/cxx_abi_1/lib:$LD_LIBRARY_PATH
+          export ASCEND_HOME_PATH=$CANN_PATH
+          export ASCEND_OPP_PATH=$CANN_PATH/opp
+
+          PYTHON=python${{ env.PYTHON_VERSION }}
+          PIP=pip${{ env.PYTHON_VERSION }}
+
+          # Install pytest.
+          $PIP install pytest pytest-xdist
+
+          cd ascend_pytorch/test
+          # Suspend errexit so the pytest exit code (not tee's) is captured
+          # and the status output is written even when tests fail.
+          set +e
+          $PYTHON -m pytest npu/test_device.py -v 2>&1 | tee /tmp/test.log
+          TEST_STATUS=${PIPESTATUS[0]}
+          set -e
+
+          if [ "${TEST_STATUS}" -eq 0 ]; then
+            echo "status=0" >> "$GITHUB_OUTPUT"
+            echo "test_device.py: PASSED"
+          else
+            echo "status=1" >> "$GITHUB_OUTPUT"
+            echo "test_device.py: FAILED"
+          fi
+          exit "${TEST_STATUS}"
+
+      - name: Upload build log
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: build-log-${{ github.run_number }}
+          path: /tmp/build.log
+          if-no-files-found: warn
+
+      - name: Upload test log
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: test-log-${{ github.run_number }}
+          path: /tmp/test.log
+          if-no-files-found: warn
+
+      - name: Upload wheel artifact
+        if: always() && steps.build.outputs.status == '0'
+        uses: actions/upload-artifact@v4
+        with:
+          name: torch_npu-wheel-${{ github.run_number }}
+          path: ascend_pytorch/dist/*.whl
+          if-no-files-found: warn
+
+      - name: Build and Test summary
+        if: always()
+        run: |
+          BUILD_STATUS="${{ steps.build.outputs.status }}"
+          TEST_STATUS="${{ steps.run_tests.outputs.status }}"
+
+          if [ "${BUILD_STATUS}" = "0" ]; then
+            BUILD_ICON="✅ SUCCESS"
+          else
+            BUILD_ICON="❌ FAILED"
+          fi
+
+          if [ "${TEST_STATUS}" = "0" ]; then
+            TEST_ICON="✅ PASSED"
+          else
+            TEST_ICON="❌ FAILED"
+          fi
+
+          cat >> "$GITHUB_STEP_SUMMARY" << EOF
+          ## NPU Build and Test
+
+          | 项目 | 详情 |
+          |------|------|
+          | 执行时间 | $(date -u '+%Y-%m-%d %H:%M UTC') |
+          | Docker 镜像 | \`${{ env.DOCKER_IMAGE }}\` |
+          | PyTorch Nightly | \`${{ steps.install_torch.outputs.version }}\` |
+          | Ascend/pytorch Commit | [\`${{ steps.clone_repo.outputs.commit_short }}\`](https://gitcode.com/Ascend/pytorch/commit/${{ steps.clone_repo.outputs.commit }}) |
+          | Commit 时间 | ${{ steps.clone_repo.outputs.commit_date }} |
+          | ccache 统计 | ${{ steps.build.outputs.ccache_stats || 'N/A' }} |
+          | 构建结果 | ${BUILD_ICON} |
+          | 测试结果 | ${TEST_ICON} |
+
+          $( [ "${BUILD_STATUS}" = "0" ] && echo "> Wheel: \`${{ steps.build.outputs.wheel }}\`" || echo "> 查看 build-log artifact 获取详细错误信息" )
+          EOF