vllm & fastchat Practical Tips - peter-xbs/CommonCodes GitHub Wiki

1. Using fastchat

1.1 fastchat deploy

# _*_ coding:utf-8 _*_

import subprocess
import multiprocessing
import time

def execute_command(command):
    # Execute the command
    try:
        subprocess.run(command, shell=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error executing command: {e}")

        # "CUDA_VISIBLE_DEVICES=3 python3 -m fastchat.serve.model_worker --model-path /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-qw0330 --port 21003 --worker-address http://localhost:21003 --controller-address http://localhost:21001" ,
        # "CUDA_VISIBLE_DEVICES=4 python3 -m fastchat.serve.model_worker --model-path /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-qw0330 --port 21004 --worker-address http://localhost:21004 --controller-address http://localhost:21001",
        # "CUDA_VISIBLE_DEVICES=5 python3 -m fastchat.serve.model_worker --model-path /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-qw0330 --port 21005 --worker-address http://localhost:21005 --controller-address http://localhost:21001" ,
        # "CUDA_VISIBLE_DEVICES=6 python3 -m fastchat.serve.model_worker --model-path /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-qw0330 --port 21006 --worker-address http://localhost:21006 --controller-address http://localhost:21001",
        # "CUDA_VISIBLE_DEVICES=0 python3 -m fastchat.serve.model_worker --model-path /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-yi0325/ --model-name jiutian-chatml-wenzhen-yi0325 --port 21007 --worker-address http://localhost:21007 --controller-address http://localhost:21001" ,
if __name__ == "__main__":
    # model_path = "/mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/qw0330/epoch-2"
    model_path = "/mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-qw0330"
    model_name = "jiutian-chatml-qw0330"
    model_name2 = "jiutian-chatml-wenzhen-qw0330"
    base_addr = "http://localhost"
    host = "0.0.0.0"
    controller = f"{base_addr}:21001"
    cmd = "fastchat.serve.vllm_worker"
    template = "jiutian-chatml"
    template2 = "jiutian-chatml-wenzhen"

    model_name_supp = "jiutian-chatml-qw0407"
    model_path_supp = "/mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/qw15_1.8_0407/epoch-1"
    sleep_time = 220
    scripts = [
        "python3 -m fastchat.serve.controller",
        f"CUDA_VISIBLE_DEVICES=0 python3 -m {cmd} --model-path {model_path} --model-names {model_name} --port 21008 --worker-address {base_addr}:21008 --controller-address {controller} --host {host} --conv-template {template}",
        f"CUDA_VISIBLE_DEVICES=1 python3 -m {cmd} --model-path {model_path} --model-names {model_name} --port 21007 --worker-address {base_addr}:21007 --controller-address {controller} --host {host} --conv-template {template}",
        f"CUDA_VISIBLE_DEVICES=2 python3 -m {cmd} --model-path {model_path} --model-names {model_name} --port 21006 --worker-address {base_addr}:21006 --controller-address {controller} --host {host} --conv-template {template}",
        f"CUDA_VISIBLE_DEVICES=3 python3 -m {cmd} --model-path {model_path} --model-names {model_name} --port 21005 --worker-address {base_addr}:21005 --controller-address {controller} --host {host} --conv-template {template}",
        f"CUDA_VISIBLE_DEVICES=4 python3 -m {cmd} --model-path {model_path} --model-names {model_name} --port 21004 --worker-address {base_addr}:21004 --controller-address {controller} --host {host} --conv-template {template}",
        f"CUDA_VISIBLE_DEVICES=5 python3 -m {cmd} --model-path {model_path_supp} --model-names {model_name_supp} --port 21009 --worker-address {base_addr}:21009 --controller-address {controller} --host {host} --conv-template {template}",
        "python3 -m fastchat.serve.gradio_web_server",
        "python3 -m fastchat.serve.openai_api_server --host localhost --port 8000 --api-keys jiutian"
        ]
    processes = []
    start_time = 1
    add_time = 1
    for script in scripts:
        # Create a new process for each command
        p = multiprocessing.Process(target=execute_command, args=(script,))
        processes.append(p)
    for idx, p in enumerate(processes):
        if idx == 1:
            # Give the controller a head start before launching the first worker
            time.sleep(20)
        if idx == len(processes) - 2:
            # Wait for the workers to finish loading before starting the web/API servers
            time.sleep(sleep_time)
        p.start()
        # Stagger the launches, waiting slightly longer before each subsequent process
        time.sleep(start_time)
        start_time += add_time
    for p in processes:
        p.join()
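
Once all processes are up, the deployment can be sanity-checked by listing the registered models through the OpenAI-compatible endpoint. A minimal sketch, assuming the api-key and port from the script above and the default /v1 route of fastchat.serve.openai_api_server:

from openai import OpenAI

client = OpenAI(api_key="jiutian", base_url="http://localhost:8000/v1")
# Each worker registered with the controller should appear as one model id
print([m.id for m in client.models.list().data])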

1.2 Calling the fastchat API

import time
from multiprocessing import Pool
from openai import OpenAI
import json
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential)
from multiprocess.pool import ThreadPool
api_key = "jiutian"
base_url = "http://127.0.0.1:8000/api/v1/"
client = OpenAI(api_key=api_key, base_url=base_url, timeout=60)

def timeit(func):
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"函数 {func.__name__} 执行耗时: {end_time - start_time} 秒")
        return result
    return wrapper

# @retry(wait=wait_random_exponential(min=1, max=2), stop=stop_after_attempt(2))
def completion_with_backoff(messages, model_name="jiutian-chatml-qw0330"):
    return client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0.01,
        top_p=0.9
        )
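
The helpers above can be combined into a simple concurrent batch caller. A minimal sketch (batch_query and query are illustrative names; it assumes the openai_api_server from 1.1 is reachable at base_url and serves the model name used above):

@timeit
def batch_query(prompts, workers=8):
    # Fan the prompts out over a thread pool; each thread issues one chat completion
    def query(text):
        messages = [{"role": "user", "content": text}]
        resp = completion_with_backoff(messages)
        return resp.choices[0].message.content
    with ThreadPool(workers) as pool:
        return pool.map(query, prompts)

if __name__ == "__main__":
    print(batch_query(["Hello", "Introduce yourself briefly."]))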

2. vllm in practice

2.1 Default deployment

python -m vllm.entrypoints.openai.api_server --model /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/qw0330/epoch-2 --api-key jiutian --host 0.0.0.0 --port 8000 --served-model-name jiutian-chatml-qwen0330  --tokenizer-mode slow

For Qwen models, vllm must be at least version 0.3.3 and transformers at least 4.38.2.
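
A quick way to confirm the installed versions meet this requirement (importlib.metadata is in the standard library from Python 3.8 on):

from importlib.metadata import version

# Both packages must be installed in the current environment
print("vllm:", version("vllm"))
print("transformers:", version("transformers"))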

2.2 Default OpenAI-style client calls

from openai import OpenAI
import json
from multiprocess.pool import ThreadPool
api_key = "jiutian"
base_url = "http://localhost:8000/v1/"
client = OpenAI(api_key=api_key, base_url=base_url, timeout=1200)

def completion_with_backoff(messages, model_name="jiutian-chatml-qw0407"):
    return client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0.01,
        top_p=0.95,
        stop=["<|im_end|>", "<|endoftext|>"],
        stream=False,
        extra_body={
            "skip_special_tokens":False,
            "stop_token_ids": [151645, 151643],
            "repetition_penalty": 1.10
        },
        )
r = completion_with_backoff([{"role": "user", "content": "你好"}])
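
The generated text can then be read from the standard OpenAI response object:

print(r.choices[0].message.content)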

Note that many sampling parameters must be passed inside the extra_body field. The supported set is listed below; a sketch of how to pass them follows the list:

### extra parameters for the chat API
# best_of: Optional[int] = None
# use_beam_search: Optional[bool] = False
# top_k: Optional[int] = -1
# min_p: Optional[float] = 0.0
# repetition_penalty: Optional[float] = 1.0
# length_penalty: Optional[float] = 1.0
# early_stopping: Optional[bool] = False
# ignore_eos: Optional[bool] = False
# min_tokens: Optional[int] = 0
# stop_token_ids: Optional[List[int]] = Field(default_factory=list)
# skip_special_tokens: Optional[bool] = True
# spaces_between_special_tokens: Optional[bool] = True
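
As a sketch of how these fields are passed (the parameter values below are illustrative, not recommendations):

r = client.chat.completions.create(
    model="jiutian-chatml-qw0407",
    messages=[{"role": "user", "content": "你好"}],
    temperature=0.01,
    extra_body={
        "top_k": 50,               # sample only from the 50 most likely tokens
        "min_tokens": 8,           # force at least 8 generated tokens
        "repetition_penalty": 1.05
    },
)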

2.3 Custom API server

Start from vllm.entrypoints.api_server (api_server.py) and modify it. The original endpoint looks like this:

@app.post("/generate")
async def generate(request: Request) -> Response:
    """Generate completion for the request.

    The request should be a JSON object with the following fields:
    - prompt: the prompt to use for the generation.
    - stream: whether to stream the results or not.
    - other fields: the sampling parameters (See `SamplingParams` for details).
    """
    request_dict = await request.json()
    prompt = request_dict.pop("prompt")
    stream = request_dict.pop("stream", False)
    sampling_params = SamplingParams(**request_dict)
    request_id = random_uuid()

    results_generator = engine.generate(prompt, sampling_params, request_id)

    # Streaming case
    async def stream_results() -> AsyncGenerator[bytes, None]:
        async for request_output in results_generator:
            prompt = request_output.prompt
            text_outputs = [
                prompt + output.text for output in request_output.outputs
            ]
            ret = {"text": text_outputs}
            yield (json.dumps(ret) + "\0").encode("utf-8")

    if stream:
        return StreamingResponse(stream_results())

    # Non-streaming case
    final_output = None
    async for request_output in results_generator:
        if await request.is_disconnected():
            # Abort the request if the client disconnects.
            await engine.abort(request_id)
            return Response(status_code=499)
        final_output = request_output

    assert final_output is not None
    prompt = final_output.prompt
    text_outputs = [prompt + output.text for output in final_output.outputs]
    ret = {"text": text_outputs}
    return JSONResponse(ret)

However, the original generate endpoint does not support the chatml template and handles multi-turn conversations poorly. The modified version is as follows:

@app.post("/generate_chatml")
async def generate_chatml(request: Request) -> Response:
    """Generate a chat completion for the request.

    The request should be a JSON object with the following fields:
    - messages: the chat messages, a list of {"role": ..., "content": ...} dicts.
    - stream: whether to stream the results or not.
    - other fields: the sampling parameters (See `SamplingParams` for details).
    """
    request_dict = await request.json()
    messages = request_dict.pop("messages")
    stream = request_dict.pop("stream", False)
    sampling_params = SamplingParams(**request_dict)
    request_id = random_uuid()
    # Assemble the prompt in chatml format
    hist = []
    for msg in messages:
        role, content = msg["role"], msg["content"]
        cur = f"<|im_start|>{role}\n{content}<|im_end|>"
        hist.append(cur)
    nxt = "<|im_start|>assistant\n"
    prompt = '\n'.join(hist + [nxt])
    results_generator = engine.generate(prompt, sampling_params, request_id)

    # Streaming case
    async def stream_results() -> AsyncGenerator[bytes, None]:
        async for request_output in results_generator:
            prompt = request_output.prompt
            text_outputs = [
                output.text for output in request_output.outputs
            ]
            ret = {"text": text_outputs}
            yield (json.dumps(ret, ensure_ascii=False) + "\0").encode("utf-8")

    if stream:
        return StreamingResponse(stream_results())

    # Non-streaming case
    final_output = None
    async for request_output in results_generator:
        if await request.is_disconnected():
            # Abort the request if the client disconnects.
            await engine.abort(request_id)
            return Response(status_code=499)
        final_output = request_output

    assert final_output is not None
    prompt = final_output.prompt
    text_outputs = [output.text for output in final_output.outputs]
    ret = {"text": text_outputs}
    return JSONResponse(ret)

2.4 Calling the custom api_server

import requests
import json

url = "https://jiutian.10086.cn/kunlun/ingress/api/h3t-eeceff/fae3164b494b4d97b7011c839013c912/ai-52164f7a6ff14124aa5629bb3b743178/service-c5cce96c51844f79885ed3ac2793d850/generate_chatml"

payload = json.dumps({
  "messages": [
    {
      "role": "user",
      "content": "你好世界,欢迎来到我的世界。先介绍中国移动哈"
    },
    {
      "role": "assistant",
      "content": "你好!我是AI助手,很高兴为您介绍中国移动。\n\n中国移动是中国最大的电信运营商,也是全球最大的移动通信运营商。它提供包括语音、数据、宽带等全方位的电信服务。中国移动的核心价值观是“正德厚生,臻于至善”,致力于成为“中国移动,让生活更美好”。\n\n中国移动在世界各地都有广泛的网络覆盖,包括中国、香港、澳门、台湾、英国等地。在中国大陆,中国移动的4G和5G网络覆盖率都非常高,为用户提供快速、稳定的网络服务。\n\n中国移动还提供了许多创新的服务和产品,如移动支付、云计算、大数据分析等。这些服务和产品帮助用户更好地享受数字化生活的便利。\n\n希望这个简单的介绍能帮您了解中国移动。如果您有任何其他问题或需要更多信息,请随时告诉我。"
    },
    {
      "role": "user",
      "content": "中国移动作出了哪些领先成绩?"
    }
  ],
  "top_p": 0.9,
  "temperature": 0.1,
  "skip_special_tokens": False,
  "repetition_penalty": 1.1,
  "stop_token_ids": [
    151645,
    151643
  ],
  "max_tokens": 4096
})
headers = {
  'Content-Type': 'application/json',
  'Authorization': 'optional'
}

response = requests.request("POST", url, headers=headers, data=payload)

print(response.text)
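
The endpoint returns the same {"text": [...]} structure built in generate_chatml above, so the completion can be extracted with:

result = json.loads(response.text)
print(result["text"][0])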