qwen3 30B 24GB VRAM - mostlygeek/llama-swap GitHub Wiki

  • Uses Unsloth's Q4_K_XL quant (download)
  • Performance: 112 tok/sec on a single 3090
  • 19.4GB VRAM required with Q8 KV cache

llama-swap config:

# llama-swap model registry: each top-level key under `models` is a model ID
# that llama-swap will start/stop on demand.
models:
  # Qwen3 30B A3B with Unsloth Q4_K_XL weights — per the notes above, fits in
  # ~19.4GB VRAM when using the q8_0 KV cache configured below.
  "Q3-30B-A3B":
    # Alternate model names accepted in requests, mapped to this entry.
    aliases:
      - "qwen/qwen3-30b-a3b"
    # Command llama-swap runs to launch the backend server; ${PORT} is a
    # llama-swap placeholder substituted at launch time.
    # NOTE: '#' comments cannot be placed inside the `|` block scalar below —
    # every line there is part of the literal command string.
    cmd: |
      /mnt/nvme/llama-server/llama-server-latest
      --host 127.0.0.1 --port ${PORT}
      --flash-attn --metrics --slots
      --model /mnt/nvme/models/Qwen3-30B-A3B-UD-Q4_K_XL.gguf
      --cache-type-k q8_0 --cache-type-v q8_0
      --ctx-size 40960 --no-context-shift
      --temp 0.6 --min-p 0
      --top-k 20 --top-p 0.95 -ngl 99
      --jinja --reasoning-format deepseek
      --no-mmap