ROCm · micmelesse · Jun 16, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jul 1, 2026
@@ -0,0 +1,62 @@
+# Baseline: the AMD production config. VLLM_ROCM_USE_AITER is left unset (vLLM
+# default False), so broad aiter compute is off and the all-reduce is the
+# QuickReduce-INT4 path (baked below via VLLM_ROCM_QUICK_REDUCE_QUANTIZATION). The
+# exp image flips to the iris one-shot all-reduce; the A/B isolates that. Each repo
+# is cloned at a fixed commit and installed; aiter's runtime kernels are prebuilt
+# into the image. The container only runs the workload.
+# Base image, pinned to a dated nightly tag.
+FROM rocm/vllm-dev:nightly_main_20260612
+
+# Set with ENV (not ARG), so it is visible to every RUN below and also persists
+# into the running container. NOTE(review): presumably caps parallel compile jobs
+# for the source builds - confirm against the build scripts that read it.
+ENV MAX_JOBS=32
+
+# Build-time pins. Each note sits on its own line on purpose: Docker only treats
+# "#" as a comment at the start of a line; after an instruction it is parsed as
+# extra arguments to that instruction.
+# GPU_ARCHS: gfx950 = MI350.
+ARG GPU_ARCHS="gfx950"
+ARG TRITON_COMMIT="6898a3288c28d50d1f4e1f91aa5867ca0d1f3c3b"
+ARG VLLM_COMMIT="17ee5b1ac5dd61fa89bc4321ef54b0a790a45db3"
+ARG AITER_COMMIT="706861590abd821e2390d5fa7fb875ae057dade8"
+ARG IRIS_COMMIT="ff072037adf7a80532807fc25e0d090874c30075"
+
+# Source installs, each checked out at the commit pinned by the matching ARG above.
+# Steps run in this order: vLLM, Triton, aiter, iris, then test/eval tooling.
+# NOTE(review): Triton is installed after vLLM, presumably so the pinned build
+# replaces whatever vLLM's requirements pulled in - confirm before reordering.
+# pip runs with --no-cache-dir so its download/wheel cache is not baked into layers.
+RUN git clone https://github.com/vllm-project/vllm.git /src/vllm \
+    && cd /src/vllm && git checkout "${VLLM_COMMIT}" \
+    && pip install --no-cache-dir -r requirements/rocm.txt \
+    && pip install --no-cache-dir . --no-build-isolation
+
+RUN pip install --no-cache-dir "triton @ git+https://github.com/triton-lang/triton.git@${TRITON_COMMIT}"
+
+# aiter is installed editable (-e) from /src/aiter; the prebuild step further down
+# imports from that same checkout. Submodules are re-synced after the checkout so
+# they match the pinned commit rather than the clone's default branch.
+RUN git clone --recursive https://github.com/ROCm/aiter.git /src/aiter \
+    && cd /src/aiter && git checkout "${AITER_COMMIT}" \
+    && git submodule update --init --recursive \
+    && pip install --no-cache-dir -e . --no-build-isolation
+
+RUN git clone https://github.com/ROCm/iris.git /src/iris \
+    && cd /src/iris && git checkout "${IRIS_COMMIT}" \
+    && pip install --no-cache-dir . --no-build-isolation
+
+# Test / eval tooling. These are unpinned, so the resolved versions depend on the
+# build date.
+RUN pip install --no-cache-dir pytest ray tblib hf_transfer hf_xet "lm_eval[api]"
+
+# Prebuild the aiter kernels the server loads so it never JIT-compiles at runtime.
+# Imported via sys.path so aiter/__init__ (which needs a live GPU) isn't pulled in.
+#
+# How this step works:
+#   - printf writes each quoted argument as one line of /tmp/prebuild_aiter.py.
+#   - The script indexes the entries of core.get_args_of_build('all')[0] by their
+#     'md_name' and asserts that every name in WANT is present, so a missing module
+#     fails the image build instead of being silently skipped.
+#   - Each wanted module is built from its own srcs/flags, with
+#     -DPREBUILD_KERNELS=2 appended to both the cc and the hip flag lists.
+#   - The GPU_ARCHS build ARG is passed to the script as an environment variable.
+RUN printf '%s\n' \
+"import sys" \
+"sys.path.insert(0, '/src/aiter/aiter')" \
+"from jit import core" \
+"WANT=['module_aiter_core','module_quant','module_rmsnorm','module_rmsnorm_quant','module_custom_all_reduce']" \
+"by={m['md_name']:m for m in core.get_args_of_build('all')[0]}" \
+"miss=[w for w in WANT if w not in by]" \
+"assert not miss, 'aiter prebuild: modules not in build set: %s' % miss" \
+"for w in WANT:" \
+"    x=by[w]" \
+"    core.build_module(md_name=w, srcs=x['srcs'], flags_extra_cc=list(x['flags_extra_cc'])+['-DPREBUILD_KERNELS=2'], flags_extra_hip=list(x['flags_extra_hip'])+['-DPREBUILD_KERNELS=2'], blob_gen_cmd=x['blob_gen_cmd'], extra_include=x['extra_include'], extra_ldflags=None, verbose=False, is_python_module=True, is_standalone=False, torch_exclude=False, third_party=x['third_party'])" \
+"    print('[prebuild] built', w, flush=True)" \
+> /tmp/prebuild_aiter.py \
+    && GPU_ARCHS="${GPU_ARCHS}" python3 /tmp/prebuild_aiter.py
+
+# Runtime environment for the server. The scripts export nothing, so whatever is
+# set here IS the environment. These values are shared by both arms: the AMD
+# production config (aiter MHA off, QuickReduce INT4 all-reduce) and the
+# operational settings (NCCL logging, RPC/ready timeouts). The baseline sets no
+# per-arm behavior flag: VLLM_ROCM_USE_AITER is left unset (vLLM default False),
+# which means broad aiter off and the QuickReduce all-reduce.
+ENV VLLM_ROCM_USE_AITER_MHA=0 \
+    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 \
+    NCCL_DEBUG=INFO \
+    VLLM_RPC_TIMEOUT=1800000 \
+    VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+# Default command when none is given to `docker run`.
+CMD ["/bin/bash"]
@@ -0,0 +1,65 @@
+# Experiment: vLLM with the iris all-reduce path (VLLM_ROCM_USE_AITER_COMMS=1).
+# Each repo is cloned at a fixed commit and installed; aiter's runtime kernels are
+# prebuilt into the image. The container only runs the workload.
+# Base image, pinned to a dated nightly tag.
+FROM rocm/vllm-dev:nightly_main_20260612
+
+# Set with ENV (not ARG), so it is visible to every RUN below and also persists
+# into the running container. NOTE(review): presumably caps parallel compile jobs
+# for the source builds - confirm against the build scripts that read it.
+ENV MAX_JOBS=32
+
+# Build-time pins. Each note sits on its own line on purpose: Docker only treats
+# "#" as a comment at the start of a line; after an instruction it is parsed as
+# extra arguments to that instruction.
+# GPU_ARCHS: gfx950 = MI350.
+ARG GPU_ARCHS="gfx950"
+ARG TRITON_COMMIT="6898a3288c28d50d1f4e1f91aa5867ca0d1f3c3b"
+# VLLM_COMMIT: micmelesse/vllm allreduce_only (make_communicator gets both cpu+device groups).
+ARG VLLM_COMMIT="4e61409d254f761994b7498b20c78c3b5ecff8ae"
+# AITER_COMMIT: ROCm/aiter micmelesse/allreduce_only (make_communicator(cpu_group, device_group, ...)).
+ARG AITER_COMMIT="51f6c3197f775bdf93272f94335a96f534334ed4"
+# IRIS_COMMIT: ROCm/iris muhaawad/one-shot-vllm.
+ARG IRIS_COMMIT="04ad32409251c7a7b55a39c8bae4629154e89c2c"
+
+# Source installs, each checked out at the commit pinned by the matching ARG above.
+# vLLM comes from the micmelesse fork in this arm. Steps run in this order: vLLM,
+# Triton, aiter, iris, then test/eval tooling.
+# NOTE(review): Triton is installed after vLLM, presumably so the pinned build
+# replaces whatever vLLM's requirements pulled in - confirm before reordering.
+# pip runs with --no-cache-dir so its download/wheel cache is not baked into layers.
+RUN git clone https://github.com/micmelesse/vllm.git /src/vllm \
+    && cd /src/vllm && git checkout "${VLLM_COMMIT}" \
+    && pip install --no-cache-dir -r requirements/rocm.txt \
+    && pip install --no-cache-dir . --no-build-isolation
+
+RUN pip install --no-cache-dir "triton @ git+https://github.com/triton-lang/triton.git@${TRITON_COMMIT}"
+
+# aiter is installed editable (-e) from /src/aiter; the prebuild step further down
+# imports from that same checkout. Submodules are re-synced after the checkout so
+# they match the pinned commit rather than the clone's default branch.
+RUN git clone --recursive https://github.com/ROCm/aiter.git /src/aiter \
+    && cd /src/aiter && git checkout "${AITER_COMMIT}" \
+    && git submodule update --init --recursive \
+    && pip install --no-cache-dir -e . --no-build-isolation
+
+RUN git clone https://github.com/ROCm/iris.git /src/iris \
+    && cd /src/iris && git checkout "${IRIS_COMMIT}" \
+    && pip install --no-cache-dir . --no-build-isolation
+
+# Test / eval tooling. These are unpinned, so the resolved versions depend on the
+# build date.
+RUN pip install --no-cache-dir pytest ray tblib hf_transfer hf_xet "lm_eval[api]"
+
+# Prebuild the aiter kernels the server loads so it never JIT-compiles at runtime.
+# Imported via sys.path so aiter/__init__ (which needs a live GPU) isn't pulled in.
+#
+# How this step works:
+#   - printf writes each quoted argument as one line of /tmp/prebuild_aiter.py.
+#   - The script indexes the entries of core.get_args_of_build('all')[0] by their
+#     'md_name' and asserts that every name in WANT is present, so a missing module
+#     fails the image build instead of being silently skipped.
+#   - Each wanted module is built from its own srcs/flags, with
+#     -DPREBUILD_KERNELS=2 appended to both the cc and the hip flag lists.
+#   - The GPU_ARCHS build ARG is passed to the script as an environment variable.
+RUN printf '%s\n' \
+"import sys" \
+"sys.path.insert(0, '/src/aiter/aiter')" \
+"from jit import core" \
+"WANT=['module_aiter_core','module_quant','module_rmsnorm','module_rmsnorm_quant','module_custom_all_reduce']" \
+"by={m['md_name']:m for m in core.get_args_of_build('all')[0]}" \
+"miss=[w for w in WANT if w not in by]" \
+"assert not miss, 'aiter prebuild: modules not in build set: %s' % miss" \
+"for w in WANT:" \
+"    x=by[w]" \
+"    core.build_module(md_name=w, srcs=x['srcs'], flags_extra_cc=list(x['flags_extra_cc'])+['-DPREBUILD_KERNELS=2'], flags_extra_hip=list(x['flags_extra_hip'])+['-DPREBUILD_KERNELS=2'], blob_gen_cmd=x['blob_gen_cmd'], extra_include=x['extra_include'], extra_ldflags=None, verbose=False, is_python_module=True, is_standalone=False, torch_exclude=False, third_party=x['third_party'])" \
+"    print('[prebuild] built', w, flush=True)" \
+> /tmp/prebuild_aiter.py \
+    && GPU_ARCHS="${GPU_ARCHS}" python3 /tmp/prebuild_aiter.py
+
+# Runtime environment for the server. The scripts export nothing, so whatever is
+# set here IS the environment. This shared group carries the same values as
+# Dockerfile.baseline: the AMD production config (aiter MHA off, QuickReduce INT4)
+# and the operational settings.
+ENV VLLM_ROCM_USE_AITER_MHA=0 \
+    VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 \
+    NCCL_DEBUG=INFO \
+    VLLM_RPC_TIMEOUT=1800000 \
+    VLLM_ENGINE_READY_TIMEOUT_S=3600
+
+# Per-arm behavior: the only env that differs from baseline. All three variables
+# are needed. vLLM gates the comms backend on is_comms_enabled() = USE_AITER and
+# USE_AITER_COMMS, so without the master toggle iris silently falls back to
+# QuickReduce; make_communicator has no default, so the backend must be declared.
+ENV VLLM_ROCM_USE_AITER=1 \
+    VLLM_ROCM_USE_AITER_COMMS=1 \
+    AITER_COMMS_BACKEND=iris
+
+# Default command when none is given to `docker run`.
+CMD ["/bin/bash"]
@@ -0,0 +1,69 @@
+# iris all-reduce reproducer
+
+Correctness and performance for the iris one-shot collectives on
+Llama-3.3-70B-FP8 at TP=8. Two arms, each a baked image (vllm, aiter, iris pinned):
+
+- **baseline** (`Dockerfile.baseline`) - AMD production config: broad aiter off,
+  the all-reduce is QuickReduce INT4.
+- **exp** (`Dockerfile.exp`) - the iris one-shot all-reduce (`AITER_COMMS_BACKEND=iris`).
+
+The A/B isolates the all-reduce path. All vLLM behavior env is baked into the images;
+the scripts only run the workload against the server.
+
+## Requirements
+
+- 8x MI350 (gfx950)
+- Docker with ROCm device access
+
+```sh
+RUN="docker run --rm -it \
+  --device /dev/kfd --device /dev/dri --group-add video \
+  --cap-add SYS_PTRACE --security-opt seccomp=unconfined \
+  --ipc host --network host --shm-size 16g \
+  -v $(pwd):/repro -w /repro"
+```
+
+## Run
+
+Build each image once, then run the commands you need against it. The A/B is the
+baseline image vs the exp image; compare their outputs.
+
+```sh
+docker build -f Dockerfile.baseline -t iris-repro:baseline .
+docker build -f Dockerfile.exp      -t iris-repro:exp      .
+
+$RUN iris-repro:baseline ./bench.sh     # perf: serving metrics (TTFT/TPOT/E2EL/throughput)
+$RUN iris-repro:exp      ./bench.sh
+
+$RUN iris-repro:baseline ./profile.sh   # traces + per-kernel tables (what data.csv needs)
+$RUN iris-repro:exp      ./profile.sh
+
+$RUN iris-repro:baseline ./eval.sh      # gsm8k accuracy gate (correctness)
+$RUN iris-repro:exp      ./eval.sh
+
+$RUN iris-repro:exp      ./test.sh      # iris collective correctness (exp stack, no server)
+```
+
+The three server-backed commands (`bench.sh`, `profile.sh`, `eval.sh`) share one server
+config (`_serve.sh`), so perf, traces, and the correctness gate all describe the same
+server; `test.sh` runs without a server. Each writes RAW artifacts under
+`output/<arm>/`: `results/` (the workload result JSON), `profile/{summary,traces,ir}/`
+(profiling run), and `arm.json` (the resolved operating point + installed code SHAs).
+
+Default operating point is `decode64` (8192 in / 1024 out, concurrency 64), warm
+(`WARMUP=64`, warmup requests excluded from the metrics). Knobs: `WORKLOAD=confluence`
+(the guide's 1024/1024/conc-4 example), `WARMUP=0` (cold), `DATA=real` (ShareGPT).
+
+## Analysis
+
+`data.csv` is the flat, long-format extract of every arm's raw artifacts (one row per
+fact: e2e metrics, profiler-table per-kernel times, trace per-kernel times). `report.ipynb`
+renders the A/B from it (pandas + matplotlib only). It ships pre-built, so you can open the
+notebook directly.
+
+To rebuild `data.csv` from arms you ran yourself, point `preprocess.py` at their output dirs (each
+`output/<arm>-<command>/` holds `arm.json` + `results/` + `profile/`):
+
+```sh
+python preprocess.py output/*     # parses each arm dir -> data.csv (reads arm.json for the labels)
+jupyter nbconvert --to notebook --execute report.ipynb   # or just open report.ipynb
+```