# FusionInfer Design
## Kubernetes CRD

### ModelLoader
```yaml
apiVersion: fusioninfer.io/v1alpha1
kind: ModelLoader
metadata:
  name: qwen-3
spec:
  source:
    type: huggingface
    uri: Qwen/Qwen3-8B
    # type: oss
    # uri: oss://llmaz.oss-ap-southeast-1-internal.aliyuncs.com/models/Qwen/Qwen3-8B
  loadStrategy:
    type: sidecar # Options: sidecar, cache
    sidecar:
      name: downloader
      image: fusioninfer/modeldownloader:latest
      args:
        - --output-dir
        - /mnt/cache/qwen/Qwen3-8B/
      volumeMounts:
        - name: models
          mountPath: /mnt/cache/qwen/Qwen3-8B/
    # cache:
    #   type: fluid
    #   persistentVolumeClaim:
    #     name: qwen-3-cache
```
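
For the cache strategy sketched in the comments above, the same model could instead be served from a Fluid-backed PersistentVolumeClaim that is populated once and mounted by every replica. A minimal sketch reusing the commented-out field names (still a draft, subject to change):

```yaml
apiVersion: fusioninfer.io/v1alpha1
kind: ModelLoader
metadata:
  name: qwen-3
spec:
  source:
    type: oss
    uri: oss://llmaz.oss-ap-southeast-1-internal.aliyuncs.com/models/Qwen/Qwen3-8B
  loadStrategy:
    type: cache
    cache:
      type: fluid
      persistentVolumeClaim:
        name: qwen-3-cache   # PVC provisioned by the Fluid dataset/runtime
```

Compared with the sidecar strategy, where each pod downloads the model itself, a shared cache avoids repeated downloads at the cost of provisioning the Fluid dataset up front.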
### InferenceService
```yaml
apiVersion: fusioninfer.io/v1alpha1
kind: InferenceService
metadata:
  name: qwen-inference-service
spec:
  modelLoaderRef:
    name: qwen-3
  roles:
    - name: gateway
      plugins:
        - type: prefix-cache-scorer
          parameters:
            blockSize: 5
            maxPrefixBlocksToMatch: 256
            lruCapacityPerServer: 31250
      schedulingProfiles:
        - name: default
          plugins:
            - pluginRef: prefix-cache-scorer
              weight: 50
    - name: prefill
      replica: 1
      multinode:
        nodeCount: 2
      template:
        metadata:
          labels:
            app: prefill
        spec:
          containers:
            - name: prefill
              image: vllm/vllm-openai:v0.11.0
              command:
                - /bin/sh
                - -c
              args:
                - |
                  python3 -m vllm.entrypoints.openai.api_server \
                    --host "0.0.0.0" \
                    --port "8000" \
                    --model $(modelLoaderRef) \
                    --served-model-name qwen3-8B \
                    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
              resources:
                limits:
                  nvidia.com/gpu: 1 # select GPU type?
    - name: decode
      replica: 1
      multinode:
        nodeCount: 2
      template:
        metadata:
          labels:
            app: decode
        spec:
          containers:
            - name: decode
              image: vllm/vllm-openai:v0.11.0
              command:
                - /bin/sh
                - -c
              args:
                - |
                  python3 -m vllm.entrypoints.openai.api_server \
                    --host "0.0.0.0" \
                    --port "8000" \
                    --model $(modelLoaderRef) \
                    --served-model-name qwen3-8B \
                    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}'
              resources:
                limits:
                  nvidia.com/gpu: 1
```
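
How a role with `multinode.nodeCount: 2` is scheduled is one of the open research items below (Multi-Node). Purely as an illustration of one option, and not a committed design, the controller could render each multi-node role into a LeaderWorkerSet, mapping `replica` to the number of groups and `nodeCount` to the group size:

```yaml
# Illustrative only: assumes multinode roles are rendered as LeaderWorkerSets.
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
  name: qwen-inference-service-prefill
spec:
  replicas: 1                # from role.replica
  leaderWorkerTemplate:
    size: 2                  # from role.multinode.nodeCount
    workerTemplate:
      metadata:
        labels:
          app: prefill
      spec:
        containers:
          - name: prefill
            image: vllm/vllm-openai:v0.11.0
            # command/args copied from the role's pod template above
            resources:
              limits:
                nvidia.com/gpu: 1
```

If no separate leader template is needed, LeaderWorkerSet reuses the worker template for the leader pod, so the role's single pod template could be copied as-is.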
### AutoscalingPolicy
```yaml
apiVersion: fusioninfer.io/v1alpha1
kind: AutoscalingPolicy
metadata:
  name: qwen-3-autoscaling-policy
spec:
  scaleTargetRef:
    name: qwen-inference-service
  subTargetSelector:
    - roleName: prefill
      minReplicas: 2
      maxReplicas: 20
      metricsSources:
        - metricSourceType: pod
          protocolType: http
          port: "8000"
          path: /metrics
          targetMetric: "vllm:request_prefill_time_seconds"
          targetValue: "0.5"
    - roleName: decode
      minReplicas: 2
      maxReplicas: 20
      metricsSources:
        - metricSourceType: pod # or Prometheus
          protocolType: http
          port: "8000"
          path: /metrics
          targetMetric: "vllm:request_decode_time_seconds"
          targetValue: "0.05"
```
## Research

### Model Loading

### Gateway

### Multi-Node

### Autoscaling

### GPU Allocation
- Allow selecting specific GPU types.
- Allow multiple tasks to share the same GPU, each using only a fraction of its resources (see the sketch below).
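
A minimal sketch of how both points could surface in a role's pod template, assuming GPU Feature Discovery labels and MIG (or another fractional-GPU device plugin) are available on the cluster; the concrete CRD fields for this are still to be decided:

```yaml
# Hypothetical pod-template fragment, not part of the current CRD draft.
spec:
  nodeSelector:
    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB   # pin a specific GPU type via a GPU Feature Discovery label
  containers:
    - name: decode
      resources:
        limits:
          nvidia.com/mig-1g.10gb: 1                 # request a MIG slice instead of a whole GPU
```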