diff --git a/.github/benchmark/oot_models_accuracy.json b/.github/benchmark/oot_models_accuracy.json index acd512e0a..6eb51fba5 100644 --- a/.github/benchmark/oot_models_accuracy.json +++ b/.github/benchmark/oot_models_accuracy.json @@ -201,8 +201,9 @@ "model_path": "deepseek-ai/DeepSeek-R1-0528", "extraArgs": "--data-parallel-size 8 --enable-expert-parallel", "env_vars": "MORI_SHMEM_MODE=ISOLATION", - "runner": "linux-atom-mi35x-8", + "runner": "atom-plugin-acc-validation-runner", "test_level": "nightly", + "priority": "P1", "accuracy_threshold": 0.93, "accuracy_baseline": 0.93, "accuracy_baseline_model": "deepseek-ai/DeepSeek-R1-0528" diff --git a/.github/benchmark/sglang_models_accuracy.json b/.github/benchmark/sglang_models_accuracy.json index 09e997dc6..eb3a314ff 100644 --- a/.github/benchmark/sglang_models_accuracy.json +++ b/.github/benchmark/sglang_models_accuracy.json @@ -326,5 +326,33 @@ "accuracy_baseline": null, "accuracy_baseline_model": "Qwen/Qwen3-32B-FP8", "_baseline_note": "Threshold placeholder until MI308 gsm8k baseline is measured in CI." + }, + { + "model_name": "MI308 GLM-5.1-FP8 TP8", + "model_path": "zai-org/GLM-5.1-FP8", + "extraArgs": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.8 --page-size 1 --disable-radix-cache", + "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "atom-mi308-8gpu-plugins-benchmark", + "test_level": "nightly", + "lm_eval_num_fewshot": 20, + "lm_eval_num_concurrent": 64, + "accuracy_threshold": 0.93, + "accuracy_baseline": null, + "accuracy_baseline_model": "zai-org/GLM-5.1-FP8", + "_baseline_note": "Threshold aligned with the SGLANG accuracy validation workflow target for GLM-5.1-FP8 TP8 on MI308." + }, + { + "model_name": "MI308 DeepSeek-V4-Flash", + "model_path": "deepseek-ai/DeepSeek-V4-Flash", + "extraArgs": "--trust-remote-code --tensor-parallel-size 8 --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.9 --swa-full-tokens-ratio 0.1 --max-running-requests 256 --page-size 256 --disable-radix-cache --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4", + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_BF16_FP8_MOE_BOUND=0\nATOM_MOE_GU_ITLV=1\nSGLANG_DEFAULT_THINKING=1\nSGLANG_DSV4_REASONING_EFFORT=max\nSGLANG_USE_AITER=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "atom-mi308-8gpu-plugins-benchmark", + "test_level": "nightly", + "lm_eval_num_fewshot": 5, + "lm_eval_num_concurrent": 8, + "accuracy_threshold": 0.94, + "accuracy_baseline": null, + "accuracy_baseline_model": "deepseek-ai/DeepSeek-V4-Flash", + "_baseline_note": "MI308 SGLang DeepSeek-V4-Flash coverage follows the SGLang DeepSeek-V4 5-shot GSM8K setup; refresh baseline after nightly measurements land." } ] diff --git a/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml b/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml index eac8bfe71..bbfcb2c2a 100644 --- a/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml +++ b/.github/workflows/atom-sglang-accuracy-validation-gpu-shard.yaml @@ -350,6 +350,8 @@ jobs: MAX_WAIT_RETRIES: "120" STREAM_SGLANG_LOGS: "1" LM_EVAL_TASK: "gsm8k" + LM_EVAL_NUM_FEWSHOT: ${{ matrix.lm_eval_num_fewshot }} + LM_EVAL_NUM_CONCURRENT: ${{ matrix.lm_eval_num_concurrent }} run: | $CONTAINER_ENGINE exec \ -e SGLANG_MODEL_NAME="${SGLANG_MODEL_NAME}" \ @@ -363,6 +365,8 @@ jobs: -e MAX_WAIT_RETRIES="${MAX_WAIT_RETRIES}" \ -e STREAM_SGLANG_LOGS="${STREAM_SGLANG_LOGS}" \ -e LM_EVAL_TASK="${LM_EVAL_TASK}" \ + -e LM_EVAL_NUM_FEWSHOT="${LM_EVAL_NUM_FEWSHOT}" \ + -e LM_EVAL_NUM_CONCURRENT="${LM_EVAL_NUM_CONCURRENT}" \ "$CONTAINER_NAME" bash -lc " set -euo pipefail bash .github/scripts/atom_sglang_test.sh accuracy diff --git a/.github/workflows/atom-sglang-accuracy-validation.yaml b/.github/workflows/atom-sglang-accuracy-validation.yaml index 78e760a94..b0844c5be 100644 --- a/.github/workflows/atom-sglang-accuracy-validation.yaml +++ b/.github/workflows/atom-sglang-accuracy-validation.yaml @@ -120,7 +120,7 @@ on: type: boolean default: false run_mi308_all: - description: "Run all MI308 gsm8k accuracy cases (Qwen on atom-mi308-8gpu-plugins-benchmark)" + description: "Run all MI308 gsm8k accuracy cases on atom-mi308-8gpu-plugins-benchmark" required: false type: boolean default: false @@ -448,6 +448,26 @@ jobs: "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=0", "runner": "atom-mi308-8gpu-plugins-benchmark", }, + { + "model_name": "MI308 GLM-5.1-FP8 TP8", + "model_path": "zai-org/GLM-5.1-FP8", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --attention-backend aiter --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.8 --page-size 1 --disable-radix-cache", + "lm_eval_num_fewshot": 20, + "lm_eval_num_concurrent": 64, + "accuracy_test_threshold": 0.93, + "env_vars": "SGLANG_AITER_FP8_PREFILL_ATTN=0\nSGLANG_USE_AITER=1\nATOM_ENABLE_DS_QKNORM_QUANT_FUSION=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nSGLANG_ENABLE_TORCH_COMPILE=1\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "atom-mi308-8gpu-plugins-benchmark", + }, + { + "model_name": "MI308 DeepSeek-V4-Flash", + "model_path": "deepseek-ai/DeepSeek-V4-Flash", + "extra_args": "--trust-remote-code --tensor-parallel-size 8 --kv-cache-dtype fp8_e4m3 --mem-fraction-static 0.9 --swa-full-tokens-ratio 0.1 --max-running-requests 256 --page-size 256 --disable-radix-cache --disable-shared-experts-fusion --tool-call-parser deepseekv4 --reasoning-parser deepseek-v4", + "lm_eval_num_fewshot": 5, + "lm_eval_num_concurrent": 8, + "accuracy_test_threshold": 0.94, + "env_vars": "SGLANG_DEFAULT_SERVER_ARGS=\nAITER_BF16_FP8_MOE_BOUND=0\nATOM_MOE_GU_ITLV=1\nSGLANG_DEFAULT_THINKING=1\nSGLANG_DSV4_REASONING_EFFORT=max\nSGLANG_USE_AITER=1\nSGLANG_EXTERNAL_MODEL_PACKAGE=atom.plugin.sglang.models\nTORCHINDUCTOR_COMPILE_THREADS=128", + "runner": "atom-mi308-8gpu-plugins-benchmark", + }, ] models.extend(mi308_models)