Merge branch 'main' of http://62.234.201.16/nzy/codecritic

1d45c3d1 · nzy · fd1582c3 · 527f2798 · 1d45c3d1 · 1d45c3d1
Commit 1d45c3d1 authored Dec 30, 2024 by nzy
Hide whitespace changes
Inline Side-by-side

Showing with 60 additions and 10 deletions

codecritic/evaluation/apps_eval.py
+3 -5

codecritic/utils/inference.py
+0 -2

scripts/gen_dataset.sh
+19 -3

scripts/gen_dataset_32b.sh
+38 -0

No files found.
--- a/codecritic/evaluation/apps_eval.py
+++ b/codecritic/evaluation/apps_eval.py
@@ -81,10 +81,8 @@ def evaluate_code_samples(code_samples, apps):
        split, idx = task_id.split('-')
        args.append((apps[split][int(idx)], sample))

-    cpu_num = multiprocessing.cpu_count()
-    # chunksize = max(len(code_samples) // (cpu_num * 5), 1)
-    chunksize = 10000
-    # TODO performance?
+    cpu_num = multiprocessing.cpu_count() // 2
+    chunksize = max(len(code_samples) // (cpu_num * 10), 1)
    results = process_map(
        test_generation, args, max_workers=cpu_num, chunksize=chunksize
    )
@@ -112,7 +110,7 @@ def evaluate(code_samples, apps):
        assert len(set(x["solution_id"] for x in lst)) == 1, "Mismatched solution_id"

        task_id, solution_id = lst[0]["task_id"], lst[0]["solution_id"]
-        
+
        if all(x["compilerr"] for x in lst):
            is_pass = False
        else:

--- a/codecritic/utils/inference.py
+++ b/codecritic/utils/inference.py
@@ -13,7 +13,6 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
        model=model_path,
        seed=42,
        max_model_len=8 * 1024,
-        swap_space=16,
        tensor_parallel_size=len(cuda_device),
    )

@@ -57,7 +56,6 @@ def score_worker(cuda_device, prompts, model_path, positive_token, negative_toke
        model=model_path,
        seed=42,
        max_model_len=8 * 1024,
-        swap_space=16,
        tensor_parallel_size=len(cuda_device),
    )


--- a/scripts/gen_dataset.sh
+++ b/scripts/gen_dataset.sh
@@ -2,16 +2,32 @@ set -xe

 model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
 project="/lustre/S/nanziyuan/projects/ccc"
+modelname="qwen25_coder_inst"

 # APPS
-CUDA_VISIBLE_DEVICES=0,1,2,3 \
+# CUDA_VISIBLE_DEVICES=0,1,2,3 \
 python -m codecritic.cli.gen_dataset \
    --model ${model} \
    --apps /lustre/S/nanziyuan/datasets/apps/ \
-    --train "${project}/data/train/apps_train_samples.jsonl" \
-    --test "${project}/data/test/apps_test_samples.jsonl"
+    --train "${project}/data/train/${modelname}-apps-train.jsonl" \
+    --test "${project}/data/test/${modelname}-apps-test.jsonl"

 # HumanEval & MBPP
+# evalplus.evaluate \
+#     --model ${model} \
+#     --n_samples 50 \
+#     --temperature 0.8 \
+#     --dataset humaneval \
+#     --root "${project}/data/test/${modelname}-humaneval" \
+#     --backend vllm
+
+# evalplus.evaluate \
+#     --model ${model} \
+#     --n_samples 50 \
+#     --temperature 0.8 \
+#     --dataset mbpp \
+#     --root "${project}/data/test/${modelname}-mbpp" \
+#     --backend vllm

 # HumanEvalPack


--- a/scripts/gen_dataset_32b.sh
+++ b/scripts/gen_dataset_32b.sh
+set -xe
+
+model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-32B-Instruct"
+project="/lustre/S/nanziyuan/projects/ccc"
+modelname="qwen25_coder_inst_32b"
+tp=4
+
+# APPS
+# CUDA_VISIBLE_DEVICES=0,1,2,3 \
+python -m codecritic.cli.gen_dataset \
+    --model ${model} \
+    --apps /lustre/S/nanziyuan/datasets/apps/ \
+    --train "${project}/data/train/${modelname}-apps-train.jsonl" \
+    --test "${project}/data/test/${modelname}-apps-test.jsonl" \
+    --tp ${tp}
+
+# HumanEval & MBPP
+evalplus.evaluate \
+    --model ${model} \
+    --n_samples 50 \
+    --temperature 0.8 \
+    --dataset humaneval \
+    --root "${project}/data/test/${modelname}-humaneval" \
+    --backend vllm \
+    --tp ${tp}
+
+evalplus.evaluate \
+    --model ${model} \
+    --n_samples 50 \
+    --temperature 0.8 \
+    --dataset mbpp \
+    --root "${project}/data/test/${modelname}-mbpp" \
+    --backend vllm \
+    --tp ${tp}
+
+# HumanEvalPack
+
+# BigCodeBench