# Environment for launching the model server with the options that follow.
# Every variable is exported so that child processes inherit it.

# Model identity. The checkpoint directory is named after the served model,
# so MODEL_HOME is derived from MODEL_NAME to keep the two in sync.
export MODEL_NAME="Qwen3.6-35B-A3B-NVFP4"
export MODEL_HOME="${HOME}/Models/${MODEL_NAME}"

# CUDA toolchain location and the nvcc binary/flags handed to FlashInfer.
export CUDA_HOME="/usr/local/cuda"
export FLASHINFER_NVCC="${CUDA_HOME}/bin/nvcc"
export FLASHINFER_CUDA_ARCH_LIST="12.0f"
export NVCC_PREPEND_FLAGS="-DCCCL_DISABLE_CTK_COMPATIBILITY_CHECK"

# Put the CUDA libraries first on the link-time and run-time search paths.
# ${VAR:+:${VAR}} appends the previous value only when it is non-empty, which
# avoids a trailing ':' (an empty LD_LIBRARY_PATH entry makes the loader search
# the current working directory) and keeps these lines valid under `set -u`.
# NOTE(review): many Linux CUDA installs ship libraries under lib64 rather
# than lib -- confirm "${CUDA_HOME}/lib" exists on the target host.
export LIBRARY_PATH="${CUDA_HOME}/lib${LIBRARY_PATH:+:${LIBRARY_PATH}}"
export LD_LIBRARY_PATH="${CUDA_HOME}/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"

# PyTorch CUDA allocator configuration.
export PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True"

# vLLM feature toggles, both set to 0.
# NOTE(review): presumably these switch off the FlashInfer FP4 MoE path and
# the FlashInfer sampler -- confirm against the installed vLLM version.
export VLLM_USE_FLASHINFER_MOE_FP4=0
export VLLM_USE_FLASHINFER_SAMPLER=0
# Server option list. The trailing backslashes join everything from here
# through the --speculative-config line into ONE logical shell line.
#
# NOTE(review): that logical line begins with an option, not a command name,
# so as written the shell would try to execute "--served-model-name". The
# options look like vLLM server flags; presumably a head line such as
# `vllm serve "$MODEL_HOME"` followed by a backslash is missing -- confirm and
# restore it.
#
# NOTE(review): the first `--tool-call-parser qwen3_xml` line also ends in a
# backslash, so what look like two separate option groups are merged. Several
# options then appear twice with different values:
#   --gpu-memory-utilization   0.93  and  0.85
#   --max-num-batched-tokens   384   and  8192
#   --no-enable-prefix-caching  and  --enable-prefix-caching
# If these are alternative launch profiles they need to be separate commands
# -- TODO confirm intent.
#
# NOTE(review): --host 0.0.0.0 presumably listens on every network interface;
# confirm that exposure is intended.
--served-model-name $MODEL_NAME \
--host 0.0.0.0 --port 8082 \
--max-num-batched-tokens 384 \
--gpu-memory-utilization 0.93 \
--quantization modelopt \
--enable-chunked-prefill \
--no-enable-prefix-caching \
--no-calculate-kv-scales \
--max-cudagraph-capture-size 64 \
--attention-backend flashinfer \
--reasoning-parser qwen3 \
--enable-auto-tool-choice \
--tool-call-parser qwen3_xml \
--served-model-name $MODEL_NAME \
--host 0.0.0.0 --port 8082 \
--tensor-parallel-size 1 \
--quantization modelopt \
--attention-backend flashinfer \
--gpu-memory-utilization 0.85 \
--max-num-batched-tokens 8192 \
--enable-chunked-prefill \
--enable-prefix-caching \
--speculative-config '{"method":"mtp","num_speculative_tokens":3,"moe_backend":"triton"}'
# Another server option list, using the same host and port (0.0.0.0:8082) as
# the option list above.
#
# NOTE(review): this logical line also starts with an option rather than a
# command name, so as written the shell would try to execute
# "--served-model-name". Presumably a `vllm serve "$MODEL_HOME"` head line is
# missing -- confirm and restore it.
#
# NOTE(review): the last line shown here ends in a backslash, so the command
# continues on whatever follows. If nothing follows, the continuation is
# dangling and should be removed -- TODO confirm.
--served-model-name $MODEL_NAME \
--host 0.0.0.0 --port 8082 \
--max-num-batched-tokens 4096 \
--gpu-memory-utilization 0.94 \
--quantization modelopt \
--enable-chunked-prefill \
--no-calculate-kv-scales \
--enable-prefix-caching \
--attention-backend flashinfer \
--reasoning-parser qwen3 \
--enable-auto-tool-choice \
--tool-call-parser qwen3_xml \