Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 62 additions & 0 deletions benchmark/llama70b/Dockerfile.baseline
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Baseline: the AMD production config. VLLM_ROCM_USE_AITER is left unset (vLLM
# default False), so broad aiter compute is off and the all-reduce is the
# QuickReduce-INT4 path (baked below via VLLM_ROCM_QUICK_REDUCE_QUANTIZATION). The
# exp image flips to the iris one-shot all-reduce; the A/B isolates that. Each repo
# is cloned at a fixed commit and installed; aiter's runtime kernels are prebuilt
# into the image. The container only runs the workload.
FROM rocm/vllm-dev:nightly_main_20260612

# Set with ENV (not ARG), so MAX_JOBS is visible to every RUN below and also
# persists into the running container.
ENV MAX_JOBS=32

# Target GPU architecture: gfx950 = MI350. Dockerfile comments are only recognized
# at the start of a line; a trailing "# ..." after an instruction is parsed as
# extra arguments, so this note sits on its own line.
ARG GPU_ARCHS="gfx950"

# Fixed commits for every repo installed below.
ARG TRITON_COMMIT="6898a3288c28d50d1f4e1f91aa5867ca0d1f3c3b"
ARG VLLM_COMMIT="17ee5b1ac5dd61fa89bc4321ef54b0a790a45db3"
ARG AITER_COMMIT="706861590abd821e2390d5fa7fb875ae057dade8"
ARG IRIS_COMMIT="ff072037adf7a80532807fc25e0d090874c30075"

# vLLM from source at VLLM_COMMIT: install the ROCm requirements first, then the
# package itself without build isolation (the build uses the packages already in
# the image). --no-cache-dir keeps pip's download cache out of the image layer.
RUN git clone https://github.com/vllm-project/vllm.git /src/vllm \
    && cd /src/vllm \
    && git checkout "${VLLM_COMMIT}" \
    && pip install --no-cache-dir -r requirements/rocm.txt \
    && pip install --no-cache-dir --no-build-isolation .

# Triton at TRITON_COMMIT, installed directly from git. --no-cache-dir keeps pip's
# cache out of the image layer.
RUN pip install --no-cache-dir \
    "triton @ git+https://github.com/triton-lang/triton.git@${TRITON_COMMIT}"

# aiter at AITER_COMMIT. Submodules are re-synced after the checkout so they match
# the pinned commit rather than the default branch that was cloned. The install is
# editable (-e), so the package is used in place from /src/aiter.
# --no-cache-dir keeps pip's cache out of the image layer.
RUN git clone --recursive https://github.com/ROCm/aiter.git /src/aiter \
    && cd /src/aiter \
    && git checkout "${AITER_COMMIT}" \
    && git submodule update --init --recursive \
    && pip install --no-cache-dir --no-build-isolation -e .

# iris at IRIS_COMMIT, installed without build isolation.
# --no-cache-dir keeps pip's cache out of the image layer.
RUN git clone https://github.com/ROCm/iris.git /src/iris \
    && cd /src/iris \
    && git checkout "${IRIS_COMMIT}" \
    && pip install --no-cache-dir --no-build-isolation .

# Additional Python packages installed on top of the pinned stack.
# NOTE(review): these are unpinned, so their versions float with the build date -
# consider pinning them if the A/B must be reproducible later.
# --no-cache-dir keeps pip's cache out of the image layer.
RUN pip install --no-cache-dir \
    pytest \
    ray \
    tblib \
    hf_transfer \
    hf_xet \
    "lm_eval[api]"

# Prebuild the aiter kernels the server loads so it never JIT-compiles at runtime.
# Imported via sys.path so aiter/__init__ (which needs a live GPU) isn't pulled in.
#
# How it works: printf writes one Python source line per argument into
# /tmp/prebuild_aiter.py, then python3 runs that script with GPU_ARCHS taken from
# the build ARG. The script looks up every module named in WANT in
# core.get_args_of_build('all')[0], fails the build (assert) if any is missing, and
# calls core.build_module for each one with -DPREBUILD_KERNELS=2 appended to both
# the cc and hip extra flags.
# NOTE(review): WANT is presumably the set of modules this server config loads at
# startup - confirm it whenever the aiter pin changes.
RUN printf '%s\n' \
"import sys" \
"sys.path.insert(0, '/src/aiter/aiter')" \
"from jit import core" \
"WANT=['module_aiter_core','module_quant','module_rmsnorm','module_rmsnorm_quant','module_custom_all_reduce']" \
"by={m['md_name']:m for m in core.get_args_of_build('all')[0]}" \
"miss=[w for w in WANT if w not in by]" \
"assert not miss, 'aiter prebuild: modules not in build set: %s' % miss" \
"for w in WANT:" \
" x=by[w]" \
" core.build_module(md_name=w, srcs=x['srcs'], flags_extra_cc=list(x['flags_extra_cc'])+['-DPREBUILD_KERNELS=2'], flags_extra_hip=list(x['flags_extra_hip'])+['-DPREBUILD_KERNELS=2'], blob_gen_cmd=x['blob_gen_cmd'], extra_include=x['extra_include'], extra_ldflags=None, verbose=False, is_python_module=True, is_standalone=False, torch_exclude=False, third_party=x['third_party'])" \
" print('[prebuild] built', w, flush=True)" \
> /tmp/prebuild_aiter.py \
&& GPU_ARCHS="${GPU_ARCHS}" python3 /tmp/prebuild_aiter.py

# Runtime environment for the server. It is baked into the image because the
# workload scripts export nothing themselves - the image IS the environment.
# Both arms share these values:
#   - AMD production config: aiter MHA off, QuickReduce INT4 all-reduce
#   - operational settings: NCCL logging, RPC and engine-ready timeouts
# The baseline arm adds NO per-arm behavior flag: VLLM_ROCM_USE_AITER is left
# unset (vLLM default False), i.e. broad aiter off and QuickReduce all-reduce.
ENV VLLM_ROCM_USE_AITER_MHA=0 \
    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 \
    NCCL_DEBUG=INFO \
    VLLM_RPC_TIMEOUT=1800000 \
    VLLM_ENGINE_READY_TIMEOUT_S=3600

# Default to a shell; the workload command is supplied at `docker run` time.
CMD ["/bin/bash"]
65 changes: 65 additions & 0 deletions benchmark/llama70b/Dockerfile.exp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Experiment: vLLM with the iris all-reduce path (VLLM_ROCM_USE_AITER_COMMS=1).
# Each repo is cloned at a fixed commit and installed; aiter's runtime kernels are
# prebuilt into the image. The container only runs the workload.
FROM rocm/vllm-dev:nightly_main_20260612

# Set with ENV (not ARG), so MAX_JOBS is visible to every RUN below and also
# persists into the running container.
ENV MAX_JOBS=32

# Target GPU architecture: gfx950 = MI350. Dockerfile comments are only recognized
# at the start of a line; a trailing "# ..." after an instruction is parsed as
# extra arguments, so every note below sits on its own line.
ARG GPU_ARCHS="gfx950"

ARG TRITON_COMMIT="6898a3288c28d50d1f4e1f91aa5867ca0d1f3c3b"
# micmelesse/vllm allreduce_only (make_communicator gets both cpu+device groups)
ARG VLLM_COMMIT="4e61409d254f761994b7498b20c78c3b5ecff8ae"
# ROCm/aiter micmelesse/allreduce_only (make_communicator(cpu_group, device_group, ...))
ARG AITER_COMMIT="51f6c3197f775bdf93272f94335a96f534334ed4"
# ROCm/iris muhaawad/one-shot-vllm
ARG IRIS_COMMIT="04ad32409251c7a7b55a39c8bae4629154e89c2c"

# vLLM from the micmelesse fork at VLLM_COMMIT: install the ROCm requirements
# first, then the package itself without build isolation (the build uses the
# packages already in the image). --no-cache-dir keeps pip's download cache out of
# the image layer.
RUN git clone https://github.com/micmelesse/vllm.git /src/vllm \
    && cd /src/vllm \
    && git checkout "${VLLM_COMMIT}" \
    && pip install --no-cache-dir -r requirements/rocm.txt \
    && pip install --no-cache-dir --no-build-isolation .

# Triton at TRITON_COMMIT, installed directly from git. --no-cache-dir keeps pip's
# cache out of the image layer.
RUN pip install --no-cache-dir \
    "triton @ git+https://github.com/triton-lang/triton.git@${TRITON_COMMIT}"

# aiter at AITER_COMMIT. Submodules are re-synced after the checkout so they match
# the pinned commit rather than the default branch that was cloned. The install is
# editable (-e), so the package is used in place from /src/aiter.
# --no-cache-dir keeps pip's cache out of the image layer.
RUN git clone --recursive https://github.com/ROCm/aiter.git /src/aiter \
    && cd /src/aiter \
    && git checkout "${AITER_COMMIT}" \
    && git submodule update --init --recursive \
    && pip install --no-cache-dir --no-build-isolation -e .

# iris at IRIS_COMMIT, installed without build isolation.
# --no-cache-dir keeps pip's cache out of the image layer.
RUN git clone https://github.com/ROCm/iris.git /src/iris \
    && cd /src/iris \
    && git checkout "${IRIS_COMMIT}" \
    && pip install --no-cache-dir --no-build-isolation .

# Additional Python packages installed on top of the pinned stack.
# NOTE(review): these are unpinned, so their versions float with the build date -
# consider pinning them if the A/B must be reproducible later.
# --no-cache-dir keeps pip's cache out of the image layer.
RUN pip install --no-cache-dir \
    pytest \
    ray \
    tblib \
    hf_transfer \
    hf_xet \
    "lm_eval[api]"

# Prebuild the aiter kernels the server loads so it never JIT-compiles at runtime.
# Imported via sys.path so aiter/__init__ (which needs a live GPU) isn't pulled in.
#
# How it works: printf writes one Python source line per argument into
# /tmp/prebuild_aiter.py, then python3 runs that script with GPU_ARCHS taken from
# the build ARG. The script looks up every module named in WANT in
# core.get_args_of_build('all')[0], fails the build (assert) if any is missing, and
# calls core.build_module for each one with -DPREBUILD_KERNELS=2 appended to both
# the cc and hip extra flags.
# NOTE(review): WANT is presumably the set of modules this server config loads at
# startup - confirm it whenever the aiter pin changes.
RUN printf '%s\n' \
"import sys" \
"sys.path.insert(0, '/src/aiter/aiter')" \
"from jit import core" \
"WANT=['module_aiter_core','module_quant','module_rmsnorm','module_rmsnorm_quant','module_custom_all_reduce']" \
"by={m['md_name']:m for m in core.get_args_of_build('all')[0]}" \
"miss=[w for w in WANT if w not in by]" \
"assert not miss, 'aiter prebuild: modules not in build set: %s' % miss" \
"for w in WANT:" \
" x=by[w]" \
" core.build_module(md_name=w, srcs=x['srcs'], flags_extra_cc=list(x['flags_extra_cc'])+['-DPREBUILD_KERNELS=2'], flags_extra_hip=list(x['flags_extra_hip'])+['-DPREBUILD_KERNELS=2'], blob_gen_cmd=x['blob_gen_cmd'], extra_include=x['extra_include'], extra_ldflags=None, verbose=False, is_python_module=True, is_standalone=False, torch_exclude=False, third_party=x['third_party'])" \
" print('[prebuild] built', w, flush=True)" \
> /tmp/prebuild_aiter.py \
&& GPU_ARCHS="${GPU_ARCHS}" python3 /tmp/prebuild_aiter.py

# Runtime environment for the server. It is baked into the image because the
# workload scripts export nothing themselves - the image IS the environment.
# This shared group is IDENTICAL to the one in Dockerfile.baseline: the AMD
# production config (aiter MHA off, QuickReduce INT4) plus operational settings.
ENV VLLM_ROCM_USE_AITER_MHA=0 \
    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 \
    NCCL_DEBUG=INFO \
    VLLM_RPC_TIMEOUT=1800000 \
    VLLM_ENGINE_READY_TIMEOUT_S=3600

# Per-arm behavior: the only environment that differs from baseline. All three
# variables are needed together:
#   - vLLM gates the comms backend on is_comms_enabled() = USE_AITER and
#     USE_AITER_COMMS, so without the master toggle iris silently falls back to
#     QuickReduce;
#   - make_communicator has no default, so the backend must be declared.
ENV VLLM_ROCM_USE_AITER=1 \
    VLLM_ROCM_USE_AITER_COMMS=1 \
    AITER_COMMS_BACKEND=iris

# Default to a shell; the workload command is supplied at `docker run` time.
CMD ["/bin/bash"]
69 changes: 69 additions & 0 deletions benchmark/llama70b/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# iris all-reduce reproducer

Reproduces the correctness and performance results for the iris one-shot collectives
on Llama-3.3-70B-FP8 at TP=8. There are two arms, each a baked image (vllm, aiter,
and iris pinned):

- **baseline** (`Dockerfile.baseline`) - AMD production config: broad aiter off,
the all-reduce is QuickReduce INT4.
- **exp** (`Dockerfile.exp`) - the iris one-shot all-reduce (`AITER_COMMS_BACKEND=iris`).

The A/B isolates the all-reduce path. All vLLM behavior env is baked into the images;
the scripts only run the workload against the server.

## Requirements

- 8x MI350 (gfx950)
- Docker with ROCm device access

```sh
RUN="docker run --rm -it \
--device /dev/kfd --device /dev/dri --group-add video \
--cap-add SYS_PTRACE --security-opt seccomp=unconfined \
--ipc host --network host --shm-size 16g \
-v $(pwd):/repro -w /repro"
```

## Run

Build each image once, then run the commands you need against it. The A/B is the
baseline image vs the exp image; compare their outputs.

```sh
docker build -f Dockerfile.baseline -t iris-repro:baseline .
docker build -f Dockerfile.exp -t iris-repro:exp .

$RUN iris-repro:baseline ./bench.sh # perf: serving metrics (TTFT/TPOT/E2EL/throughput)
$RUN iris-repro:exp ./bench.sh

$RUN iris-repro:baseline ./profile.sh # traces + per-kernel tables (what data.csv needs)
$RUN iris-repro:exp ./profile.sh

$RUN iris-repro:baseline ./eval.sh # gsm8k accuracy gate (correctness)
$RUN iris-repro:exp ./eval.sh

$RUN iris-repro:exp ./test.sh # iris collective correctness (exp stack, no server)
```

The three server-backed commands (`bench.sh`, `profile.sh`, `eval.sh`) share one server
config (`_serve.sh`), so perf, traces, and the correctness gate all describe the same
server; `test.sh` runs without a server. Each writes RAW artifacts under
`output/<arm>/`: `results/` (the workload result JSON), `profile/{summary,traces,ir}/`
(profiling run), and `arm.json` (the resolved operating point + installed code SHAs).

Default operating point is `decode64` (8192 in / 1024 out, concurrency 64), warm
(`WARMUP=64`, warmup requests excluded from the metrics). Knobs: `WORKLOAD=confluence`
(the guide's 1024/1024/conc-4 example), `WARMUP=0` (cold), `DATA=real` (ShareGPT).

## Analysis

`data.csv` is the flat, long-format extract of every arm's raw artifacts (one row per
fact: e2e metrics, profiler-table per-kernel times, trace per-kernel times). `report.ipynb`
renders the A/B from it (pandas + matplotlib only). `data.csv` ships pre-built, so you can
open the notebook directly without running anything first.

To rebuild it from arms you ran yourself, point `preprocess.py` at their output dirs (each
`output/<arm>-<command>/` holds `arm.json` + `results/` + `profile/`):

```sh
python preprocess.py output/* # parses each arm dir -> data.csv (reads arm.json for the labels)
jupyter nbconvert --to notebook --execute report.ipynb # or just open report.ipynb
```
Loading
Loading