LM‐EVAL - yiliu30/yi GitHub Wiki

# Serve the model with vLLM (OpenAI-compatible API on port 8688).
# Extra lm-eval debug flags, kept for reference: --log_samples --verbosity DEBUG
export VLLM_LOGGING_LEVEL=DEBUG
timestamp=$(date +%Y%m%d-%H%M%S)
log_file="server.${timestamp}.log"

# Pick ONE model path; alternatives kept for reference.
# model_path=/home/yiliu7/models/deepseek-ai/DeepSeek-R1
# model_path=/home/yliu7/workspace/inc/3rd-party/llm-compressor/examples/quantization_non_uniform/Llama-3.2-1B-Instruct-NVFP4-FP8-Dynamic
model_path="/data5/yliu7/HF_HOME/qwen_moe_skip_lm_head"

tp_size=2
ep_size=2  # NOTE(review): unused below; expert parallelism is enabled via --enable-expert-parallel

VLLM_USE_STANDALONE_COMPILE=1 VLLM_WORKER_MULTIPROC_METHOD=spawn vllm serve "$model_path" \
    --max-model-len 8192 \
    --tensor-parallel-size "$tp_size" \
    --max-num-seqs 64 \
    --gpu-memory-utilization 0.6 \
    --dtype bfloat16 \
    --port 8688 \
    --enable-expert-parallel \
    --trust-remote-code 2>&1 | tee "$log_file"


# Run lm-eval against the local vLLM server via the OpenAI completions API.
# Bypass any HTTP proxy for the local endpoint. No spaces between entries:
# most clients do not trim them, so " 127.0.0.1" would fail to match and
# requests would still be sent through the proxy.
export no_proxy="localhost,127.0.0.1,::1"

task_name=gsm8k
batch_size=256
# LIMIT=32
timestamp=$(date +%Y%m%d_%H%M%S)
EVAL_LOG_NAME="eval_${task_name}_${timestamp}"
max_length=8192      # total (prompt + generation) token budget
max_gen_toks=2048    # generation-only cap

mkdir -p benchmark_logs

# Pick ONE model path; must match the model the server was started with.
# model_path=/home/yliu7/workspace/inc/3rd-party/llm-compressor/examples/quantization_non_uniform/Llama-3.2-1B-Instruct-NVFP4-FP8-Dynamic
model_path="/data5/yliu7/HF_HOME/qwen_moe_skip_lm_head"

HF_ALLOW_CODE_EVAL=1 \
lm_eval --model local-completions \
    --tasks "$task_name" \
    --model_args "model=${model_path},base_url=http://127.0.0.1:8688/v1/completions,max_concurrent=1,max_length=${max_length},max_gen_toks=${max_gen_toks}" \
    --batch_size "${batch_size}" \
    --gen_kwargs="max_length=${max_length},max_gen_toks=${max_gen_toks}" \
    --confirm_run_unsafe_code \
    --log_samples \
    --output_path "benchmark_logs/${EVAL_LOG_NAME}" \
    2>&1 | tee "benchmark_logs/${EVAL_LOG_NAME}.log"
    

HF BACKEND

# Evaluate directly with the HF backend (no server needed).
model_path="/dataset/auto-round/qwen_moe/"
taskname=gsm8k
timestamp=$(date +%Y%m%d_%H%M%S)
output_log_file_name="${taskname}_${timestamp}"

# NOTE: no trailing comma after --model_args — a stray "," outside the quotes
# is concatenated into the argument and lm-eval then sees an empty key=value
# pair, which can break its kwarg parsing.
HF_ALLOW_CODE_EVAL=1 lm_eval \
    --model hf \
    --tasks "$taskname" \
    --model_args "pretrained=${model_path},max_length=8192" \
    --batch_size auto \
    --limit 32 \
    --confirm_run_unsafe_code \
    --gen_kwargs="max_length=8192,max_gen_toks=2048" \
    --trust_remote_code \
    --output_path "$output_log_file_name" 2>&1 | tee "${output_log_file_name}.out"

Install

# Pin a known-good lm-eval release; newer versions may be needed for recent vLLM APIs.
pip install lm-eval==0.4.8

TypeError: LLM.generate() got an unexpected keyword argument 'prompt_token_ids'
  • pip install "lm-eval>0.4.9"   (quote the version spec — an unquoted ">" is shell output redirection and would create a file named `0.4.9` instead of constraining the version)

Lm-eval MMMU

@mengni
vllm (pretrained=/mengni/scout_mxfp4/Llama-4-Scout-w4g32,tensor_parallel_size=2,max_model_len=8192,max_num_seqs=1024,max_gen_toks=2048,kv_cache_dtype=auto,enable_expert_parallel=True,gpu_memory_utilization=0.7), gen_kwargs: (max_gen_toks=2048), limit: None, num_fewshot: None, batch_size: 1