Here is the magic (assuming a 4-GPU machine)...
# Step 1 (on the host): start the rocm/vllm-dev:nightly container.
# No command follows the image name, so the image's default command runs
# (presumably an interactive shell, given -it; confirm for this image).
# The host's /root/.cache is bind-mounted at /mnt/model inside the container.
docker run -it --rm \
  --pull=always \
  --ipc=host \
  --network=host \
  --privileged \
  --cap-add=CAP_SYS_ADMIN \
  --device=/dev/kfd \
  --device=/dev/dri \
  --device=/dev/mem \
  --group-add render \
  --cap-add=SYS_PTRACE \
  --security-opt seccomp=unconfined \
  -v /home/hotaisle:/mnt/data \
  -v /root/.cache:/mnt/model \
  rocm/vllm-dev:nightly

# Step 2 (INSIDE the container, not on the host): /mnt/model only exists as a
# mount target in the container. Move the container's own /root/.cache aside
# and replace it with a symlink to the mounted host cache.
mv /root/.cache /root/.cache.foo
ln -s /mnt/model /root/.cache

# Step 3 (inside the container): serve the model across 4 tensor-parallel ranks.
VLLM_ROCM_USE_AITER=1 vllm serve zai-org/GLM-4.7-FP8 \
  --tensor-parallel-size 4 \
  --kv-cache-dtype fp8 \
  --quantization fp8 \
  --enable-auto-tool-choice \
  --tool-call-parser glm47 \
  --reasoning-parser glm45 \
  --load-format fastsafetensors \
  --enable-expert-parallel \
  --allowed-local-media-path / \
  --speculative-config.method mtp \
  --speculative-config.num_speculative_tokens 1 \
  --mm-encoder-tp-mode data
reply
Here is the magic (assuming a 4-GPU machine)...