#!/usr/bin/env bash
# Runs dataset generation / benchmark sampling for a single model.
#
# Shell options:
#   -e  abort on the first failing command
#   -u  treat expansion of an unset variable as an error, so a misspelled
#       name cannot silently turn into an empty path or flag value
#   -x  trace every command before it runs
set -eux

# Checkpoint location handed to every tool via --model.
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-32B-Instruct"
# Project root; output files are written under ${project}/data/.
project="/lustre/S/nanziyuan/projects/ccc"
# Short tag used as the prefix of every output file/directory name.
modelname="qwen25_coder_inst_32b"
# Value forwarded as --tp (presumably the tensor-parallel size, i.e. the
# number of GPUs per model instance -- confirm against the tools' docs).
tp=4

# APPS
# Run codecritic's gen_dataset CLI against the APPS dataset directory.
# --train / --test name the JSONL files under ${project}/data/, prefixed
# with ${modelname}.
# Every expansion is quoted so a path containing whitespace or glob
# characters is still passed as a single argument.
# To pin the run to specific GPUs, uncomment the next line:
# CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -m codecritic.cli.gen_dataset \
    --model "${model}" \
    --apps /lustre/S/nanziyuan/datasets/apps/ \
    --train "${project}/data/train/${modelname}-apps-train.jsonl" \
    --test "${project}/data/test/${modelname}-apps-test.jsonl" \
    --tp "${tp}"

# HumanEval & MBPP
# Both benchmarks use the same evalplus.evaluate settings (50 samples at
# temperature 0.8, vllm backend); only the dataset name and the matching
# output root differ, so run them from a single loop instead of two
# copy-pasted invocations.
for dataset in humaneval mbpp; do
    evalplus.evaluate \
        --model "${model}" \
        --n_samples 50 \
        --temperature 0.8 \
        --dataset "${dataset}" \
        --root "${project}/data/test/${modelname}-${dataset}" \
        --backend vllm \
        --tp "${tp}"
done

# TODO: HumanEvalPack (placeholder -- no commands yet)

# TODO: BigCodeBench (placeholder -- no commands yet)
