# vLLM & FastChat Practical Tips
Launch the FastChat controller, a set of vLLM workers, the Gradio web UI, and the OpenAI-compatible API server from a single script:

```python
# _*_ coding:utf-8 _*_
import subprocess
import multiprocessing
import time


def execute_command(command):
    # Run the command in a shell; report failures instead of crashing the launcher
    try:
        subprocess.run(command, shell=True, check=True)
    except subprocess.CalledProcessError as e:
        print(f"Error executing command: {e}")


# Legacy non-vLLM workers (fastchat.serve.model_worker), kept for reference:
# "CUDA_VISIBLE_DEVICES=3 python3 -m fastchat.serve.model_worker --model-path /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-qw0330 --port 21003 --worker-address http://localhost:21003 --controller-address http://localhost:21001",
# "CUDA_VISIBLE_DEVICES=4 python3 -m fastchat.serve.model_worker --model-path /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-qw0330 --port 21004 --worker-address http://localhost:21004 --controller-address http://localhost:21001",
# "CUDA_VISIBLE_DEVICES=5 python3 -m fastchat.serve.model_worker --model-path /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-qw0330 --port 21005 --worker-address http://localhost:21005 --controller-address http://localhost:21001",
# "CUDA_VISIBLE_DEVICES=6 python3 -m fastchat.serve.model_worker --model-path /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-qw0330 --port 21006 --worker-address http://localhost:21006 --controller-address http://localhost:21001",
# "CUDA_VISIBLE_DEVICES=0 python3 -m fastchat.serve.model_worker --model-path /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-yi0325/ --model-name jiutian-chatml-wenzhen-yi0325 --port 21007 --worker-address http://localhost:21007 --controller-address http://localhost:21001",

if __name__ == "__main__":
    # model_path = "/mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/qw0330/epoch-2"
    model_path = "/mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/jiutian-chatml-qw0330"
    model_name = "jiutian-chatml-qw0330"
    model_name2 = "jiutian-chatml-wenzhen-qw0330"
    base_addr = "http://localhost"
    host = "0.0.0.0"
    controller = f"{base_addr}:21001"
    cmd = "fastchat.serve.vllm_worker"
    template = "jiutian-chatml"
    template2 = "jiutian-chatml-wenzhen"
    model_name_supp = "jiutian-chatml-qw0407"
    model_path_supp = "/mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/qw15_1.8_0407/epoch-1"
    sleep_time = 220

    scripts = [
        "python3 -m fastchat.serve.controller",
        f"CUDA_VISIBLE_DEVICES=0 python3 -m {cmd} --model-path {model_path} --model-names {model_name} --port 21008 --worker-address {base_addr}:21008 --controller-address {controller} --host {host} --conv-template {template}",
        f"CUDA_VISIBLE_DEVICES=1 python3 -m {cmd} --model-path {model_path} --model-names {model_name} --port 21007 --worker-address {base_addr}:21007 --controller-address {controller} --host {host} --conv-template {template}",
        f"CUDA_VISIBLE_DEVICES=2 python3 -m {cmd} --model-path {model_path} --model-names {model_name} --port 21006 --worker-address {base_addr}:21006 --controller-address {controller} --host {host} --conv-template {template}",
        f"CUDA_VISIBLE_DEVICES=3 python3 -m {cmd} --model-path {model_path} --model-names {model_name} --port 21005 --worker-address {base_addr}:21005 --controller-address {controller} --host {host} --conv-template {template}",
        f"CUDA_VISIBLE_DEVICES=4 python3 -m {cmd} --model-path {model_path} --model-names {model_name} --port 21004 --worker-address {base_addr}:21004 --controller-address {controller} --host {host} --conv-template {template}",
        f"CUDA_VISIBLE_DEVICES=5 python3 -m {cmd} --model-path {model_path_supp} --model-names {model_name_supp} --port 21009 --worker-address {base_addr}:21009 --controller-address {controller} --host {host} --conv-template {template}",
        "python3 -m fastchat.serve.gradio_web_server",
        "python3 -m fastchat.serve.openai_api_server --host localhost --port 8000 --api-keys jiutian",
    ]

    processes = []
    start_time = 1
    add_time = 1
    for script in scripts:
        # One process per command
        p = multiprocessing.Process(target=execute_command, args=(script,))
        processes.append(p)

    for idx, p in enumerate(processes):
        if idx == 1:
            # Give the controller time to come up before the first worker registers
            time.sleep(20)
        if idx == len(processes) - 2:
            # Let the workers finish loading before the web UI / API server start
            time.sleep(sleep_time)
        p.start()
        # Stagger subsequent launches by a growing interval
        time.sleep(start_time)
        start_time += add_time

    for p in processes:
        p.join()
```
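Once everything is up, one way to confirm that the workers registered is to query the controller's `list_models` endpoint (a small sketch against the controller port used above):

```python
import requests

# The FastChat controller exposes POST /list_models; the registered
# model names should appear in the response
r = requests.post("http://localhost:21001/list_models")
print(r.json())
```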
A client for the FastChat OpenAI-compatible server:

```python
import time
import json
from openai import OpenAI
from multiprocess.pool import ThreadPool  # dill-based fork of multiprocessing
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)

api_key = "jiutian"
base_url = "http://127.0.0.1:8000/v1/"  # fastchat.serve.openai_api_server serves under /v1
client = OpenAI(api_key=api_key, base_url=base_url, timeout=60)


def timeit(func):
    # Simple timing decorator for profiling calls
    def wrapper(*args, **kwargs):
        start_time = time.time()
        result = func(*args, **kwargs)
        end_time = time.time()
        print(f"Function {func.__name__} took {end_time - start_time} seconds")
        return result
    return wrapper


# @retry(wait=wait_random_exponential(min=1, max=2), stop=stop_after_attempt(2))
def completion_with_backoff(messages, model_name="jiutian-chatml-qw0330"):
    return client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0.01,
        top_p=0.9,
    )
```
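A single call then looks like:

```python
r = completion_with_backoff([{"role": "user", "content": "你好"}])
print(r.choices[0].message.content)
```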
vLLM can also serve the checkpoint directly through its built-in OpenAI-compatible server:

```bash
python -m vllm.entrypoints.openai.api_server --model /mnt/vol1/sunxinbao/CODE/deepspeed-chat/ckpts/qw0330/epoch-2 --api-key jiutian --host 0.0.0.0 --port 8000 --served-model-name jiutian-chatml-qwen0330 --tokenizer-mode slow
```

For Qwen models, the vLLM version must be at least 0.3.3 and transformers at least 4.38.2.
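A quick way to verify the environment meets these minimums (a minimal sketch using `packaging`, which transformers already depends on):

```python
from packaging import version
import vllm
import transformers

# Fail fast if the installed versions are too old for Qwen support
assert version.parse(vllm.__version__) >= version.parse("0.3.3"), vllm.__version__
assert version.parse(transformers.__version__) >= version.parse("4.38.2"), transformers.__version__
```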
The client side is the same OpenAI SDK pattern, with the vLLM-specific knobs passed through `extra_body`:

```python
from openai import OpenAI
import json
from multiprocess.pool import ThreadPool

api_key = "jiutian"
base_url = "http://localhost:8000/v1/"
client = OpenAI(api_key=api_key, base_url=base_url, timeout=1200)


def completion_with_backoff(messages, model_name="jiutian-chatml-qw0407"):
    return client.chat.completions.create(
        model=model_name,
        messages=messages,
        temperature=0.01,
        top_p=0.95,
        stop=["<|im_end|>", "<|endoftext|>"],
        stream=False,
        extra_body={
            "skip_special_tokens": False,
            "stop_token_ids": [151645, 151643],  # <|im_end|>, <|endoftext|>
            "repetition_penalty": 1.10,
        },
    )


r = completion_with_backoff([{"role": "user", "content": "你好"}])
```
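The `ThreadPool` imported above can fan requests out in parallel; a minimal sketch (prompt list and pool size are illustrative):

```python
prompts = [[{"role": "user", "content": q}] for q in ["你好", "介绍一下vLLM"]]
with ThreadPool(8) as pool:
    # Each worker thread issues one chat completion request
    results = pool.map(completion_with_backoff, prompts)
for r in results:
    print(r.choices[0].message.content)
```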
Note that many parameters must be passed inside the `extra_body` field; the supported extras for the chat API are listed below:
```python
### extra parameters for the chat API
# best_of: Optional[int] = None
# use_beam_search: Optional[bool] = False
# top_k: Optional[int] = -1
# min_p: Optional[float] = 0.0
# repetition_penalty: Optional[float] = 1.0
# length_penalty: Optional[float] = 1.0
# early_stopping: Optional[bool] = False
# ignore_eos: Optional[bool] = False
# min_tokens: Optional[int] = 0
# stop_token_ids: Optional[List[int]] = Field(default_factory=list)
# skip_special_tokens: Optional[bool] = True
# spaces_between_special_tokens: Optional[bool] = True
```
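For example, to also constrain `top_k` and force a minimum output length (values below are illustrative):

```python
r = client.chat.completions.create(
    model="jiutian-chatml-qw0407",
    messages=[{"role": "user", "content": "你好"}],
    extra_body={
        "top_k": 50,        # illustrative values
        "min_tokens": 8,
        "repetition_penalty": 1.05,
    },
)
```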
Alternatively, `vllm.entrypoints.api_server` can be modified directly. The original API looks like this:
@app.post("/generate")
async def generate(request: Request) -> Response:
"""Generate completion for the request.
The request should be a JSON object with the following fields:
- prompt: the prompt to use for the generation.
- stream: whether to stream the results or not.
- other fields: the sampling parameters (See `SamplingParams` for details).
"""
request_dict = await request.json()
prompt = request_dict.pop("prompt")
stream = request_dict.pop("stream", False)
sampling_params = SamplingParams(**request_dict)
request_id = random_uuid()
results_generator = engine.generate(prompt, sampling_params, request_id)
# Streaming case
async def stream_results() -> AsyncGenerator[bytes, None]:
async for request_output in results_generator:
prompt = request_output.prompt
text_outputs = [
prompt + output.text for output in request_output.outputs
]
ret = {"text": text_outputs}
yield (json.dumps(ret) + "\0").encode("utf-8")
if stream:
return StreamingResponse(stream_results())
# Non-streaming case
final_output = None
async for request_output in results_generator:
if await request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
return Response(status_code=499)
final_output = request_output
assert final_output is not None
prompt = final_output.prompt
text_outputs = [prompt + output.text for output in final_output.outputs]
ret = {"text": text_outputs}
return JSONResponse(ret)
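For reference, the stock endpoint is exercised like this (assuming the server runs on localhost:8000; the sampling fields map onto `SamplingParams`):

```python
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "你好", "max_tokens": 64, "temperature": 0.1},
)
print(resp.json()["text"])
```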
However, the stock generate endpoint has no notion of the ChatML template and handles multi-turn conversations poorly, so it was changed as follows:
@app.post("/generate_chatml")
async def generate(request: Request) -> Response:
"""Generate completion for the request.
The request should be a JSON object with the following fields:
- prompt: the prompt to use for the generation.
- stream: whether to stream the results or not.
- other fields: the sampling parameters (See `SamplingParams` for details).
"""
request_dict = await request.json()
messages = request_dict.pop("messages")
stream = request_dict.pop("stream", False)
sampling_params = SamplingParams(**request_dict)
request_id = random_uuid()
## chatml格式拼接
hist = []
for msg in messages:
role, content = msg["role"], msg["content"]
cur = "<|im_start|>{role}\n{content}<|im_end|>"
hist.append(cur)
nxt = "<|im_start|>assistant\n"
prompt = '\n'.join(hist+[nxt])
results_generator = engine.generate(prompt, sampling_params, request_id)
# Streaming case
async def stream_results() -> AsyncGenerator[bytes, None]:
async for request_output in results_generator:
prompt = request_output.prompt
text_outputs = [
output.text for output in request_output.outputs
]
ret = {"text": text_outputs}
yield (json.dumps(ret, ensure_ascii=False) + "\0").encode("utf-8")
if stream:
return StreamingResponse(stream_results())
# Non-streaming case
final_output = None
async for request_output in results_generator:
if await request.is_disconnected():
# Abort the request if the client disconnects.
await engine.abort(request_id)
return Response(status_code=499)
final_output = request_output
assert final_output is not None
prompt = final_output.prompt
text_outputs = [output.text for output in final_output.outputs]
ret = {"text": text_outputs}
return JSONResponse(ret)
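A standalone check of the same prompt assembly, to see exactly what string the engine receives (the conversation is illustrative):

```python
messages = [
    {"role": "user", "content": "你好"},
    {"role": "assistant", "content": "你好!有什么可以帮您?"},
    {"role": "user", "content": "介绍一下中国移动"},
]
hist = [f"<|im_start|>{m['role']}\n{m['content']}<|im_end|>" for m in messages]
prompt = "\n".join(hist + ["<|im_start|>assistant\n"])
print(prompt)
```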
Calling the modified endpoint (here through a gateway in front of the server):

```python
import requests
import json

url = "https://jiutian.10086.cn/kunlun/ingress/api/h3t-eeceff/fae3164b494b4d97b7011c839013c912/ai-52164f7a6ff14124aa5629bb3b743178/service-c5cce96c51844f79885ed3ac2793d850/generate_chatml"

payload = json.dumps({
    "messages": [
        {
            "role": "user",
            "content": "你好世界,欢迎来到我的世界。先介绍中国移动哈"
        },
        {
            "role": "assistant",
            "content": "你好!我是AI助手,很高兴为您介绍中国移动。\n\n中国移动是中国最大的电信运营商,也是全球最大的移动通信运营商。它提供包括语音、数据、宽带等全方位的电信服务。中国移动的核心价值观是“正德厚生,臻于至善”,致力于成为“中国移动,让生活更美好”。\n\n中国移动在世界各地都有广泛的网络覆盖,包括中国、香港、澳门、台湾、英国等地。在中国大陆,中国移动的4G和5G网络覆盖率都非常高,为用户提供快速、稳定的网络服务。\n\n中国移动还提供了许多创新的服务和产品,如移动支付、云计算、大数据分析等。这些服务和产品帮助用户更好地享受数字化生活的便利。\n\n希望这个简单的介绍能帮您了解中国移动。如果您有任何其他问题或需要更多信息,请随时告诉我。"
        },
        {
            "role": "user",
            "content": "中国移动作出了哪些领先成绩?"
        }
    ],
    "top_p": 0.9,
    "temperature": 0.1,
    "skip_special_tokens": False,
    "repetition_penalty": 1.1,
    "stop_token_ids": [151645, 151643],
    "max_tokens": 4096
})
headers = {
    'Content-Type': 'application/json',
    'Authorization': 'optional'
}
response = requests.request("POST", url, headers=headers, data=payload)
print(response.text)
```
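The endpoint returns `{"text": [...]}` as built above, so the reply itself can be pulled out with:

```python
# First (and usually only) completion in the "text" list
reply = json.loads(response.text)["text"][0]
print(reply)
```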