llama vit - KrArunT/InfobellIT-Gen-AI GitHub Wiki

vLLM Deploy

# Launch the vLLM OpenAI-compatible server in Docker.
# - Host port 8001 maps to the server's container port 8000.
# - The HuggingFace cache is mounted so model downloads persist across runs.
# - ./models_3.2-1b is mounted at /models (pass `--model /models` instead to
#   serve that local checkpoint).
# NOTE: the original command ended in a dangling `\` with no server args;
# `--model facebook/opt-125m` matches the model used by the test query below.
docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    -v ./models_3.2-1b:/models \
    -p 8001:8000 \
    --ipc=host \
    vllm/vllm-openai:latest \
    --model facebook/opt-125m

vLLM Test Query

# Smoke-test the deployed server through the OpenAI-compatible
# /v1/completions endpoint. With "stream": true the server answers with
# server-sent-event chunks rather than a single JSON object.
curl --request POST "http://192.168.1.231:8001/v1/completions" \
  --header "Content-Type: application/json" \
  --data '{
 "prompt": "What is the capital of France?",
 "model": "facebook/opt-125m",
 "max_tokens": 50,
 "stream": true
}'