llama vit - KrArunT/InfobellIT-Gen-AI GitHub Wiki
vLLM Deploy
# Deploy the vLLM OpenAI-compatible server in Docker.
# - Host port 8001 -> container port 8000 (the server's default).
# - HF cache is mounted so model weights are not re-downloaded on restart.
# - Local model dir is mounted at /models; docker run -v requires an
#   absolute host path, so resolve it with "$(pwd)" (a bare ./ path is
#   treated as a named-volume name and rejected).
# - Arguments after the image name are passed to vLLM; --model must be
#   given, and it matches the model used by the test query below.
docker run --runtime nvidia --gpus all \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  -v "$(pwd)/models_3.2-1b:/models" \
  -p 8001:8000 \
  --ipc=host \
  vllm/vllm-openai:latest \
  --model facebook/opt-125m
vLLM Test Query
# Smoke-test the deployed server via the OpenAI-compatible completions API.
# Uses localhost because port 8001 is published on the deploy host
# (replace with the server's address when querying remotely).
# "stream": true returns Server-Sent Events chunks; -N disables curl
# buffering so tokens appear as they arrive.
curl -N -X POST 'http://localhost:8001/v1/completions' \
  -H "Content-Type: application/json" \
  -d '{
    "prompt": "What is the capital of France?",
    "model": "facebook/opt-125m",
    "max_tokens": 50,
    "stream": true
  }'