From fea78b2efda3da5764c0b09d597b67e8a40169da Mon Sep 17 00:00:00 2001
From: Kaihui-AMD
Date: Mon, 1 Jun 2026 16:46:52 +0800
Subject: [PATCH] Add AMD ROCm Docker support (RDNA3/RDNA4)

---
 compose.rocm.yml       |  58 ++++++++++++++++++++++
 docker/Dockerfile.rocm | 106 +++++++++++++++++++++++++++++++++++++++++
 docs/en/install.md     |  37 ++++++++++++++
 3 files changed, 201 insertions(+)
 create mode 100644 compose.rocm.yml
 create mode 100644 docker/Dockerfile.rocm

diff --git a/compose.rocm.yml b/compose.rocm.yml
new file mode 100644
index 00000000..cf0bfec4
--- /dev/null
+++ b/compose.rocm.yml
@@ -0,0 +1,58 @@
+name: fish-speech-rocm
+
+# AMD ROCm compose for Fish Speech (RDNA3 / RDNA4).
+# Mount your checkpoints into ./checkpoints before running.
+#
+#   docker compose -f compose.rocm.yml --profile webui up --build
+#   docker compose -f compose.rocm.yml --profile server up --build
+
+services:
+  webui:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.rocm
+      target: webui
+    image: fish-speech-webui:rocm
+    profiles: ["webui"]
+    ports:
+      - "${GRADIO_PORT:-7860}:7860"
+    volumes:
+      - ./checkpoints:/app/checkpoints
+      - ./references:/app/references
+    environment:
+      - ROCBLAS_USE_HIPBLASLT=0
+      - COMPILE=${COMPILE:-1}
+    devices:
+      - /dev/kfd
+      - /dev/dri
+    group_add:
+      - video
+      - render
+    shm_size: "16g"
+    tty: true
+    stdin_open: true
+
+  server:
+    build:
+      context: .
+      dockerfile: docker/Dockerfile.rocm
+      target: server
+    image: fish-speech-server:rocm
+    profiles: ["server"]
+    ports:
+      - "${API_PORT:-8080}:8080"
+    volumes:
+      - ./checkpoints:/app/checkpoints
+      - ./references:/app/references
+    environment:
+      - ROCBLAS_USE_HIPBLASLT=0
+      - COMPILE=${COMPILE:-1}
+    devices:
+      - /dev/kfd
+      - /dev/dri
+    group_add:
+      - video
+      - render
+    shm_size: "16g"
+    tty: true
+    stdin_open: true
diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm
new file mode 100644
index 00000000..aca6fc52
--- /dev/null
+++ b/docker/Dockerfile.rocm
@@ -0,0 +1,106 @@
+# docker/Dockerfile.rocm
+#
+# Fish Speech on AMD ROCm (RDNA3 / RDNA4).
+# The checkpoints are NOT bundled — mount them at /app/checkpoints.
+#
+# Build:
+#   docker build -f docker/Dockerfile.rocm --target webui -t fish-speech-webui:rocm .
+#   docker build -f docker/Dockerfile.rocm --target server -t fish-speech-server:rocm .
+#
+# Run (webui):
+#   docker run --device=/dev/kfd --device=/dev/dri \
+#     --group-add video --group-add render \
+#     -e ROCBLAS_USE_HIPBLASLT=0 \
+#     -v ./checkpoints:/app/checkpoints \
+#     -p 7860:7860 fish-speech-webui:rocm
+
+ARG ROCM_VERSION=7.2.3
+ARG BASE_IMAGE=rocm/pytorch:rocm${ROCM_VERSION}_ubuntu24.04_py3.12_pytorch_release_2.9.1
+
+FROM ${BASE_IMAGE} AS app-base
+
+ENV DEBIAN_FRONTEND=noninteractive \
+    PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    ROCBLAS_USE_HIPBLASLT=0
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends \
+        git ffmpeg libsox-dev build-essential cmake \
+        libasound-dev portaudio19-dev libportaudio2 libportaudiocpp0 \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+COPY . /app
+
+# Install runtime dependencies WITHOUT torch/torchaudio — the ROCm base image
+# already ships a gfx-tuned torch (2.9.1+rocm7.2.3). Then install the package
+# itself with --no-deps so pip does not try to pull a CUDA/CPU torch.
+RUN pip install --no-cache-dir --upgrade pip setuptools wheel \
+    && pip install --no-cache-dir \
+        numpy "transformers<=4.57.3" datasets lightning pytorch_lightning \
+        hydra-core natsort einops librosa rich "gradio>5.0.0" wandb grpcio kui \
+        uvicorn loguru loralib pyrootutils resampy "einx[torch]==0.2.2" zstandard \
+        pydub "modelscope==1.17.1" "opencc-python-reimplemented==0.1.7" \
+        silero-vad ormsgpack tiktoken "pydantic==2.9.2" cachetools \
+        descript-audio-codec safetensors soundfile vector_quantize_pytorch \
+    && pip install --no-cache-dir --no-build-isolation pyaudio \
+    && pip install --no-cache-dir --no-deps -e . \
+    # descript-audiotools pins protobuf<3.20, but fish-speech's generated proto
+    # code needs >=3.20. Override after install (mirrors pyproject's uv override).
+    && pip install --no-cache-dir --no-deps --upgrade "protobuf>=4.25,<6.0"
+
+EXPOSE 7860 8080
+
+# torch.compile is enabled by default (verified working on gfx1201/RDNA4).
+# Set COMPILE=0 to disable.
+ENV COMPILE=1
+
+##############################################################
+# Gradio WebUI
+##############################################################
+FROM app-base AS webui
+
+ARG GRADIO_SERVER_NAME="0.0.0.0"
+ARG GRADIO_SERVER_PORT=7860
+ENV GRADIO_SERVER_NAME=${GRADIO_SERVER_NAME} \
+    GRADIO_SERVER_PORT=${GRADIO_SERVER_PORT}
+
+RUN printf '%s\n' \
+    '#!/bin/bash' \
+    'set -e' \
+    'ARGS=()' \
+    'if [ "${COMPILE:-0}" = "1" ] || [ "${COMPILE:-}" = "true" ]; then ARGS+=(--compile); fi' \
+    'exec python tools/run_webui.py \' \
+    ' --llama-checkpoint-path checkpoints/s2-pro \' \
+    ' --decoder-checkpoint-path checkpoints/s2-pro/codec.pth \' \
+    ' --decoder-config-name modded_dac_vq "${ARGS[@]}"' \
+    > /app/start_webui.sh && chmod +x /app/start_webui.sh
+
+ENTRYPOINT ["/app/start_webui.sh"]
+
+##############################################################
+# API Server: listen address comes from API_SERVER_NAME / API_SERVER_PORT at container start
+##############################################################
+FROM app-base AS server
+
+ARG API_SERVER_NAME="0.0.0.0"
+ARG API_SERVER_PORT=8080
+ENV API_SERVER_NAME=${API_SERVER_NAME} \
+    API_SERVER_PORT=${API_SERVER_PORT}
+
+RUN printf '%s\n' \
+    '#!/bin/bash' \
+    'set -e' \
+    'ARGS=()' \
+    'if [ "${COMPILE:-0}" = "1" ] || [ "${COMPILE:-}" = "true" ]; then ARGS+=(--compile); fi' \
+    'exec python tools/api_server.py \' \
+    ' --listen "${API_SERVER_NAME:-0.0.0.0}:${API_SERVER_PORT:-8080}" \' \
+    ' --llama-checkpoint-path checkpoints/s2-pro \' \
+    ' --decoder-checkpoint-path checkpoints/s2-pro/codec.pth \' \
+    ' --decoder-config-name modded_dac_vq "${ARGS[@]}"' \
+    > /app/start_server.sh && chmod +x /app/start_server.sh
+
+ENTRYPOINT ["/app/start_server.sh"]
diff --git a/docs/en/install.md b/docs/en/install.md
index 1d090b58..dc76474f 100644
--- a/docs/en/install.md
+++ b/docs/en/install.md
@@ -189,3 +189,40 @@ Both methods require mounting these directories:
 
 !!! warning
     GPU support requires NVIDIA Docker runtime. For CPU-only deployment, remove the `--gpus all` flag and use CPU images.
+
+### AMD ROCm support
+
+Fish Speech runs on AMD GPUs via ROCm. The ROCm image is based on the official `rocm/pytorch` image, which already ships a gfx-tuned PyTorch, so no separate torch install is needed. Verified on RDNA4 (Radeon AI PRO R9700 / gfx1201) with ROCm 7.2.3; RDNA3 (gfx1100/gfx1101) should also work.
+
+**Prerequisites:**
+
+- AMD GPU with ROCm support (RDNA3 / RDNA4)
+- ROCm drivers installed on the host
+- Docker with GPU passthrough (`/dev/kfd` and `/dev/dri`)
+
+**Using Docker Compose:**
+
+```bash
+# WebUI
+docker compose -f compose.rocm.yml --profile webui up --build
+
+# API server
+docker compose -f compose.rocm.yml --profile server up --build
+```
+
+**Manual build and run:**
+
+```bash
+docker build -f docker/Dockerfile.rocm --target webui -t fish-speech-webui:rocm .
+
+docker run \
+    --device=/dev/kfd --device=/dev/dri \
+    --group-add video --group-add render \
+    -e ROCBLAS_USE_HIPBLASLT=0 \
+    -v ./checkpoints:/app/checkpoints \
+    -p 7860:7860 \
+    fish-speech-webui:rocm
+```
+
+!!! note
+    `ROCBLAS_USE_HIPBLASLT=0` is set by default for RDNA4 (gfx1201) stability; RDNA3 users may not need it. Fish Speech uses `scaled_dot_product_attention`, which dispatches to ROCm's AOTriton flash-attention backend automatically — no custom kernel build is required. The first run is slower while MIOpen auto-tunes kernels. `torch.compile` is enabled by default (`COMPILE=1`); set `COMPILE=0` to disable.