diff --git a/.github/benchmark/oot_benchmark_models.json b/.github/benchmark/oot_benchmark_models.json index 44c933461..7ae7e8d3e 100644 --- a/.github/benchmark/oot_benchmark_models.json +++ b/.github/benchmark/oot_benchmark_models.json @@ -164,7 +164,7 @@ "prefix": "kimi-k25-mxfp4-tp4-met", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nAITER_USE_FLYDSL_MOE_SORTING=1\nAITER_AR_1STAGE_MAX_KB=512" }, { "tp_size": 4, @@ -173,7 +173,7 @@ "prefix": "kimi-k2-5-mxfp4-aw-tp4", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nAITER_USE_FLYDSL_MOE_SORTING=1\nAITER_AR_1STAGE_MAX_KB=512" }, { "tp_size": 8, @@ -182,7 +182,7 @@ "prefix": "kimi-k2-5-mxfp4-aw-tp8", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nAITER_USE_FLYDSL_MOE_SORTING=1\nAITER_AR_1STAGE_MAX_KB=512" } ] }, @@ -433,7 +433,7 @@ "prefix": "minimax-m2-5-aw-tp2", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 2 --kv-cache-dtype fp8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1\nAITER_AR_1STAGE_MAX_KB=2048" }, { "tp_size": 4, @@ -442,7 +442,7 @@ "prefix": "minimax-m2-5-aw-tp4", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --kv-cache-dtype fp8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1\nAITER_AR_1STAGE_MAX_KB=2048" }, { "tp_size": 8, @@ -451,7 +451,7 @@ "prefix": "minimax-m2-5-aw-tp8", "bench_args": "", "extra_args": "--trust-remote-code --tensor-parallel-size 8 --kv-cache-dtype fp8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1" + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1\nAITER_AR_1STAGE_MAX_KB=2048" } ] } diff --git a/.github/benchmark/oot_models_accuracy.json b/.github/benchmark/oot_models_accuracy.json index f31486da6..ba957b413 100644 --- a/.github/benchmark/oot_models_accuracy.json +++ b/.github/benchmark/oot_models_accuracy.json @@ -165,7 +165,7 @@ "model_name": "Kimi-K2.5-MXFP4 TP4", "model_path": "amd/Kimi-K2.5-MXFP4-AttnFP8", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nAITER_USE_FLYDSL_MOE_SORTING=1\nAITER_AR_1STAGE_MAX_KB=512", "runner": "atom-plugin-acc-validation-runner", "test_level": "nightly", "priority": "P0", @@ -175,7 +175,7 @@ "model_name": "Kimi-K2.5-MXFP4 TP8", "model_path": "amd/Kimi-K2.5-MXFP4-AttnFP8", "extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nAITER_USE_FLYDSL_MOE_SORTING=1\nAITER_AR_1STAGE_MAX_KB=512", "runner": "atom-plugin-acc-validation-runner", "test_level": "nightly", "priority": "P1", @@ -310,7 +310,7 @@ "model_name": "MiniMax-M2.5 TP2", "model_path": "MiniMaxAI/MiniMax-M2.5", "extra_args": "--trust-remote-code --tensor-parallel-size 2 --kv-cache-dtype fp8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1\nAITER_AR_1STAGE_MAX_KB=2048", "runner": "atom-plugin-acc-validation-runner", "test_level": "nightly", "priority": "P0", @@ -320,7 +320,7 @@ "model_name": "MiniMax-M2.5 TP4", "model_path": "MiniMaxAI/MiniMax-M2.5", "extra_args": "--trust-remote-code --tensor-parallel-size 4 --kv-cache-dtype fp8 --max-num-batched-tokens 16384 --max-model-len 16384", - "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1", + "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1\nAITER_AR_1STAGE_MAX_KB=2048", "runner": "atom-plugin-acc-validation-runner", "test_level": "nightly", "priority": "P1", diff --git a/recipes/atom_vllm/Kimi-K2.5.md b/recipes/atom_vllm/Kimi-K2.5.md index 81b9567ba..9ad34fb57 100644 --- a/recipes/atom_vllm/Kimi-K2.5.md +++ b/recipes/atom_vllm/Kimi-K2.5.md @@ -19,6 +19,10 @@ We adopt [amd/Kimi-K2.5-MXFP4-AttnFP8](https://huggingface.co/amd/Kimi-K2.5-MXFP ```bash # use quick allreduce to reduce TTFT export AITER_QUICK_REDUCE_QUANTIZATION=INT4 +# use flydsl MoE sorting kernel +export AITER_USE_FLYDSL_MOE_SORTING=1 +# cap the single-stage allreduce size (KB) to favor the faster path +export AITER_AR_1STAGE_MAX_KB=512 vllm serve amd/Kimi-K2.5-MXFP4-AttnFP8 \ --host localhost \ diff --git a/recipes/atom_vllm/MiniMax-M2.5.md b/recipes/atom_vllm/MiniMax-M2.5.md index e421fcf15..1629a91ee 100644 --- a/recipes/atom_vllm/MiniMax-M2.5.md +++ b/recipes/atom_vllm/MiniMax-M2.5.md @@ -20,6 +20,8 @@ The following matches the vLLM-ATOM benchmark entries in `.github/benchmark/oot_ export AITER_QUICK_REDUCE_QUANTIZATION=INT4 export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1 export ATOM_USE_GLUON_PA_DECODE=1 +# cap the single-stage allreduce size (KB) to favor the faster path +export AITER_AR_1STAGE_MAX_KB=2048 vllm serve MiniMaxAI/MiniMax-M2.5 \ --host localhost \