Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions .github/benchmark/oot_benchmark_models.json
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@
"prefix": "kimi-k25-mxfp4-tp4-met",
"bench_args": "",
"extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384",
- "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
+ "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nAITER_USE_FLYDSL_MOE_SORTING=1\nAITER_AR_1STAGE_MAX_KB=512"
},
{
"tp_size": 4,
Expand All @@ -173,7 +173,7 @@
"prefix": "kimi-k2-5-mxfp4-aw-tp4",
"bench_args": "",
"extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384",
- "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
+ "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nAITER_USE_FLYDSL_MOE_SORTING=1\nAITER_AR_1STAGE_MAX_KB=512"
},
{
"tp_size": 8,
Expand All @@ -182,7 +182,7 @@
"prefix": "kimi-k2-5-mxfp4-aw-tp8",
"bench_args": "",
"extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384",
- "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4"
+ "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nAITER_USE_FLYDSL_MOE_SORTING=1\nAITER_AR_1STAGE_MAX_KB=512"
}
]
},
Expand Down Expand Up @@ -433,7 +433,7 @@
"prefix": "minimax-m2-5-aw-tp2",
"bench_args": "",
"extra_args": "--trust-remote-code --tensor-parallel-size 2 --kv-cache-dtype fp8 --max-num-batched-tokens 16384 --max-model-len 16384",
- "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1"
+ "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1\nAITER_AR_1STAGE_MAX_KB=2048"
},
{
"tp_size": 4,
Expand All @@ -442,7 +442,7 @@
"prefix": "minimax-m2-5-aw-tp4",
"bench_args": "",
"extra_args": "--trust-remote-code --tensor-parallel-size 4 --kv-cache-dtype fp8 --max-num-batched-tokens 16384 --max-model-len 16384",
- "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1"
+ "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1\nAITER_AR_1STAGE_MAX_KB=2048"
},
{
"tp_size": 8,
Expand All @@ -451,7 +451,7 @@
"prefix": "minimax-m2-5-aw-tp8",
"bench_args": "",
"extra_args": "--trust-remote-code --tensor-parallel-size 8 --kv-cache-dtype fp8 --max-num-batched-tokens 16384 --max-model-len 16384",
- "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1"
+ "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1\nAITER_AR_1STAGE_MAX_KB=2048"
}
]
}
Expand Down
8 changes: 4 additions & 4 deletions .github/benchmark/oot_models_accuracy.json
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@
"model_name": "Kimi-K2.5-MXFP4 TP4",
"model_path": "amd/Kimi-K2.5-MXFP4-AttnFP8",
"extra_args": "--trust-remote-code --tensor-parallel-size 4 --max-num-batched-tokens 16384 --max-model-len 16384",
- "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4",
+ "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nAITER_USE_FLYDSL_MOE_SORTING=1\nAITER_AR_1STAGE_MAX_KB=512",
"runner": "atom-plugin-acc-validation-runner",
"test_level": "nightly",
"priority": "P0",
Expand All @@ -175,7 +175,7 @@
"model_name": "Kimi-K2.5-MXFP4 TP8",
"model_path": "amd/Kimi-K2.5-MXFP4-AttnFP8",
"extra_args": "--trust-remote-code --tensor-parallel-size 8 --max-num-batched-tokens 16384 --max-model-len 16384",
- "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4",
+ "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nAITER_USE_FLYDSL_MOE_SORTING=1\nAITER_AR_1STAGE_MAX_KB=512",
"runner": "atom-plugin-acc-validation-runner",
"test_level": "nightly",
"priority": "P1",
Expand Down Expand Up @@ -310,7 +310,7 @@
"model_name": "MiniMax-M2.5 TP2",
"model_path": "MiniMaxAI/MiniMax-M2.5",
"extra_args": "--trust-remote-code --tensor-parallel-size 2 --kv-cache-dtype fp8 --max-num-batched-tokens 16384 --max-model-len 16384",
- "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1",
+ "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1\nAITER_AR_1STAGE_MAX_KB=2048",
"runner": "atom-plugin-acc-validation-runner",
"test_level": "nightly",
"priority": "P0",
Expand All @@ -320,7 +320,7 @@
"model_name": "MiniMax-M2.5 TP4",
"model_path": "MiniMaxAI/MiniMax-M2.5",
"extra_args": "--trust-remote-code --tensor-parallel-size 4 --kv-cache-dtype fp8 --max-num-batched-tokens 16384 --max-model-len 16384",
- "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1",
+ "env_vars": "AITER_QUICK_REDUCE_QUANTIZATION=INT4\nATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1\nATOM_USE_GLUON_PA_DECODE=1\nAITER_AR_1STAGE_MAX_KB=2048",
"runner": "atom-plugin-acc-validation-runner",
"test_level": "nightly",
"priority": "P1",
Expand Down
4 changes: 4 additions & 0 deletions recipes/atom_vllm/Kimi-K2.5.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,10 @@ We adopt [amd/Kimi-K2.5-MXFP4-AttnFP8](https://huggingface.co/amd/Kimi-K2.5-MXFP
```bash
# use quick allreduce to reduce TTFT
export AITER_QUICK_REDUCE_QUANTIZATION=INT4
+ # use flydsl MoE sorting kernel
+ export AITER_USE_FLYDSL_MOE_SORTING=1
+ # cap the single-stage allreduce size (KB) to favor the faster path
+ export AITER_AR_1STAGE_MAX_KB=512

vllm serve amd/Kimi-K2.5-MXFP4-AttnFP8 \
--host localhost \
Expand Down
2 changes: 2 additions & 0 deletions recipes/atom_vllm/MiniMax-M2.5.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ The following matches the vLLM-ATOM benchmark entries in `.github/benchmark/oot_
export AITER_QUICK_REDUCE_QUANTIZATION=INT4
export ATOM_ENABLE_QK_NORM_ROPE_CACHE_QUANT_FUSION=1
export ATOM_USE_GLUON_PA_DECODE=1
+ # cap the single-stage allreduce size (KB) to favor the faster path
+ export AITER_AR_1STAGE_MAX_KB=2048

vllm serve MiniMaxAI/MiniMax-M2.5 \
--host localhost \
Expand Down
Loading