Commit d66937e2 by nanziyuan

adjust performance for apps eval

parent 91cf4380
...@@ -81,10 +81,8 @@ def evaluate_code_samples(code_samples, apps): ...@@ -81,10 +81,8 @@ def evaluate_code_samples(code_samples, apps):
split, idx = task_id.split('-') split, idx = task_id.split('-')
args.append((apps[split][int(idx)], sample)) args.append((apps[split][int(idx)], sample))
cpu_num = multiprocessing.cpu_count() cpu_num = multiprocessing.cpu_count() // 2
# chunksize = max(len(code_samples) // (cpu_num * 5), 1) chunksize = max(len(code_samples) // (cpu_num * 10), 1)
chunksize = 10000
# TODO performance?
results = process_map( results = process_map(
test_generation, args, max_workers=cpu_num, chunksize=chunksize test_generation, args, max_workers=cpu_num, chunksize=chunksize
) )
...@@ -112,7 +110,7 @@ def evaluate(code_samples, apps): ...@@ -112,7 +110,7 @@ def evaluate(code_samples, apps):
assert len(set(x["solution_id"] for x in lst)) == 1, "Mismatched solution_id" assert len(set(x["solution_id"] for x in lst)) == 1, "Mismatched solution_id"
task_id, solution_id = lst[0]["task_id"], lst[0]["solution_id"] task_id, solution_id = lst[0]["task_id"], lst[0]["solution_id"]
if all(x["compilerr"] for x in lst): if all(x["compilerr"] for x in lst):
is_pass = False is_pass = False
else: else:
......
...@@ -2,16 +2,32 @@ set -xe ...@@ -2,16 +2,32 @@ set -xe
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/" model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc" project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst"
# APPS # APPS
CUDA_VISIBLE_DEVICES=0,1,2,3 \ # CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -m codecritic.cli.gen_dataset \ python -m codecritic.cli.gen_dataset \
--model ${model} \ --model ${model} \
--apps /lustre/S/nanziyuan/datasets/apps/ \ --apps /lustre/S/nanziyuan/datasets/apps/ \
--train "${project}/data/train/apps_train_samples.jsonl" \ --train "${project}/data/train/${modelname}-apps-train.jsonl" \
--test "${project}/data/test/apps_test_samples.jsonl" --test "${project}/data/test/${modelname}-apps-test.jsonl"
# HumanEval & MBPP # HumanEval & MBPP
# evalplus.evaluate \
# --model ${model} \
# --n_samples 50 \
# --temperature 0.8 \
# --dataset humaneval \
# --root "${project}/data/test/${modelname}-humaneval" \
# --backend vllm
# evalplus.evaluate \
# --model ${model} \
# --n_samples 50 \
# --temperature 0.8 \
# --dataset mbpp \
# --root "${project}/data/test/${modelname}-mbpp" \
# --backend vllm
# HumanEvalPack # HumanEvalPack
......
#!/usr/bin/env bash
# Generate APPS train/test samples and run HumanEval/MBPP evaluation for
# Qwen2.5-Coder-32B-Instruct.
# -x: echo each command before running it; -e: abort on the first failure.
set -xe

model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-32B-Instruct"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst_32b"

# APPS: sample solutions for the APPS train/test splits.
# GPU pinning left disabled; re-enable if the node is shared:
# CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -m codecritic.cli.gen_dataset \
    --model "${model}" \
    --apps /lustre/S/nanziyuan/datasets/apps/ \
    --train "${project}/data/train/${modelname}-apps-train.jsonl" \
    --test "${project}/data/test/${modelname}-apps-test.jsonl" \
    --tp 2

# HumanEval & MBPP via evalplus on the vLLM backend.
# --tp 2: presumably tensor-parallel degree for the 32B model — confirm
# against the evalplus/vllm CLI docs.
evalplus.evaluate \
    --model "${model}" \
    --n_samples 50 \
    --temperature 0.8 \
    --dataset humaneval \
    --root "${project}/data/test/${modelname}-humaneval" \
    --backend vllm \
    --tp 2

evalplus.evaluate \
    --model "${model}" \
    --n_samples 50 \
    --temperature 0.8 \
    --dataset mbpp \
    --root "${project}/data/test/${modelname}-mbpp" \
    --backend vllm \
    --tp 2

# HumanEvalPack
# BigCodeBench
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment