FusionInfer Design

Kubernetes CRDs

ModelLoader

ModelLoader declares where the model weights come from (Hugging Face or an OSS bucket) and how they are staged onto serving nodes: either a downloader sidecar in each serving pod, or a pre-populated shared cache (e.g., a Fluid-backed PVC).

apiVersion: fusioninfer.io/v1alpha1
kind: ModelLoader
metadata:
  name: qwen-3
spec:
  source:
    type: huggingface
    uri: Qwen/Qwen3-8B
    # type: oss
    # uri: oss://llmaz.oss-ap-southeast-1-internal.aliyuncs.com/models/Qwen/Qwen3-8B
  loadStrategy:
    type: sidecar  # Options: sidecar, cache
    sidecar:
      name: downloader
      image: fusioninfer/modeldownloader:latest
      args:
        - --output-dir
        - /mnt/cache/qwen/Qwen3-8B/
      volumeMounts:
        - name: models
          mountPath: /mnt/cache/qwen/Qwen3-8B/
    # cache:
    #   type: fluid
    #   persistentVolumeClaim:
    #     name: qwen-3-cache
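
How the load strategy materializes in a serving pod is not pinned down above; below is a rough sketch of what the controller could render for the sidecar strategy. The init-container placement, the emptyDir backing for the models volume, and the substitution of $(modelLoaderRef) with the sidecar's --output-dir are assumptions, not settled API.

# Hypothetical pod fragment rendered from the ModelLoader above (sidecar strategy).
spec:
  initContainers:
    - name: downloader                       # injected from loadStrategy.sidecar
      image: fusioninfer/modeldownloader:latest
      args: ["--output-dir", "/mnt/cache/qwen/Qwen3-8B/"]
      volumeMounts:
        - name: models
          mountPath: /mnt/cache/qwen/Qwen3-8B/
  containers:
    - name: prefill
      image: vllm/vllm-openai:v0.11.0
      volumeMounts:
        - name: models                       # same volume, so $(modelLoaderRef) can
          mountPath: /mnt/cache/qwen/Qwen3-8B/   # resolve to this path
  volumes:
    - name: models
      emptyDir: {}                           # assumption; could equally be the cache PVC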

InferenceService

InferenceService wires a ModelLoader into a set of roles: a gateway role that scores and routes requests (here with a prefix-cache scorer), plus prefill and decode roles that run vLLM across multi-node groups and exchange KV cache via the NIXL connector.

apiVersion: fusioninfer.io/v1alpha1
kind: InferenceService
metadata:
  name: qwen-inference-service
spec:
  modelLoaderRef:
    name: qwen-3
  roles:
    - name: gateway
      plugins:
        - type: prefix-cache-scorer
          parameters:
            blockSize: 5
            maxPrefixBlocksToMatch: 256
            lruCapacityPerServer: 31250
      schedulingProfiles:
        - name: default
          plugins:
            - pluginRef: prefix-cache-scorer
              weight: 50
    - name: prefill
      replica: 1
      multinode:
        nodeCount: 2
      template:
        metadata:
          labels:
            app: prefill
        spec:
          containers:
            - name: prefill
              image: vllm/vllm-openai:v0.11.0
              command:
                - /bin/sh
                - -c
              args:
                - |
                  python3 -m vllm.entrypoints.openai.api_server \
                  --host "0.0.0.0" \
                  --port "8000" \
                  --model $(modelLoaderRef) \
                  --served-model-name qwen3-8B \
                  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
              resources:
                limits:
                  nvidia.com/gpu: 1 # select GPU type?
    - name: decode
      replica: 1
      multinode:
        nodeCount: 2
      template:
        metadata:
          labels:
            app: decode
        spec:
          containers:
            - name: decode
              image: vllm/vllm-openai:v0.11.0
              command:
                - /bin/sh
                - -c
              args:
                - |
                  python3 -m vllm.entrypoints.openai.api_server \
                  --host "0.0.0.0" \
                  --port "8000" \
                  --model $(modelLoaderRef) \
                  --served-model-name qwen3-8B \
                  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
              resources:
                limits:
                  nvidia.com/gpu: 1
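
Once the roles are up, clients would send standard OpenAI-compatible requests through the gateway role. The host below is a placeholder (the Service exposed by the gateway is not named in this design); the path and model name follow from the vllm api_server flags above.

curl http://<gateway-service>:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "qwen3-8B", "prompt": "San Francisco is a", "max_tokens": 32}'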

AutoscalingPolicy

AutoscalingPolicy scales the roles of an InferenceService independently: each subTargetSelector entry picks a role, sets replica bounds, and drives scaling from metrics scraped off the serving pods (or from Prometheus).

apiVersion: fusioninfer.io/v1alpha1
kind: AutoscalingPolicy
metadata:
  name: qwen-3-autoscaling-policy
spec:
  scaleTargetRef:
    name: qwen-inference-service
  subTargetSelector:
    - roleName: prefill
      minReplicas: 2
      maxReplicas: 20
      metricsSources:
        - metricSourceType: pod
          protocolType: http
          port: "8000"
          path: /metrics
          targetMetric: "vllm:request_prefill_time_seconds"
          targetValue: "0.5"
    - roleName: decode
      minReplicas: 2
      maxReplicas: 20
      metricsSources:
        - metricSourceType: pod # or Prometheus
          protocolType: http
          port: "8000"
          path: /metrics
          targetMetric: "vllm:request_decode_time_seconds"
          targetValue: "0.05"

Research

Model Loading

Gateway

Multi-Node

Autoscaling

GPU Allocation

  • Allows selecting a specific GPU type for a role (see the "# select GPU type?" note on the prefill role above).
  • Allows a single GPU to be shared by multiple tasks, each using only a fraction of the GPU's compute and memory (see the sketch below).
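
A hedged sketch of how both ideas could surface in a role's pod template: GPU type selection via node labels (here the nvidia.com/gpu.product label published by NVIDIA GPU Feature Discovery), and fractional sharing via a device-plugin resource (ACK's cGPU gpu-mem resource shown as one example). The exact label and resource names depend on the cluster's device plugins and are not part of this design yet.

    template:
      spec:
        nodeSelector:
          nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB   # pin a specific GPU type
        containers:
          - name: decode
            resources:
              limits:
                aliyun.com/gpu-mem: 20   # ~20 GiB of a shared GPU instead of a whole card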