Commit 94ea5cb1 by nzy

new evaluation logic

parent b1db6cb1
......@@ -6,9 +6,9 @@ import multiprocessing
import numpy as np
from tqdm.contrib.concurrent import process_map
from datasets import load_dataset
from codecritic.evaluation.apps_exec import run_test
from codecritic.utils.json import save_jsonl
from codecritic.dataset.code import extract_code
TIMEOUT = 10
......@@ -41,7 +41,7 @@ def check_correctness(sample, generation, timeout, debug=False):
def test_generation(args, debug=False):
    """Run one (APPS problem, sample) pair and summarize the test outcome.

    args: tuple of (apps_item, sample); the sample carries its cleaned
    solution code under "meta_clean_code".
    Returns the sample dict merged with:
      "task_id"/"solution_id" — copied through for later result matching,
      "pass"      — True iff every testcase result is positive,
      "timeout"   — True iff any testcase hit the -1 timeout sentinel,
      "compilerr" — True iff any testcase kept the -2 sentinel below.
    """
    apps_item, sample = args
    code = sample["meta_clean_code"]
    # -2 is the "never ran" sentinel: if execution fails before producing
    # results, the sample is reported as a compile error.
    curr_res = [-2]
    try:
        # NOTE(review): the original try body is hidden by the diff hunk;
        # reconstructed from the check_correctness helper visible in the
        # hunk header — confirm against the full file.
        curr_res = check_correctness(apps_item, code, timeout=TIMEOUT, debug=debug)
    except Exception as e:
        if debug:
            print(f"test framework exception = {repr(e)}")
    problem_result = np.asarray(curr_res)
    return {
        **sample,
        "task_id": sample["task_id"],
        "solution_id": sample["solution_id"],
        "pass": bool(np.all(problem_result > 0)),
        "timeout": bool(-1 in curr_res),
        "compilerr": bool(-2 in curr_res),
    }
def evaluate_code_samples(code_samples, apps):
    """Evaluate every code sample against its APPS problem in parallel.

    code_samples: list of dicts, each with a "task_id" of the form
    "<split>-<index>" (e.g. "test-123") addressing a row of `apps`.
    apps: the loaded APPS dataset (indexable as apps[split][index]).
    Returns one result dict per sample, in input order (see test_generation).
    """
    args = []
    for sample in code_samples:
        # task_id encodes both the dataset split and the row index.
        task_id = sample["task_id"]
        split, idx = task_id.split('-')
        args.append((apps[split][int(idx)], sample))
    cpu_num = multiprocessing.cpu_count()
    # ~5 chunks per worker keeps the pool busy without oversized batches.
    chunksize = max(len(code_samples) // (cpu_num * 5), 1)
    # NOTE(review): the original process_map call is hidden by the diff
    # hunk; reconstructed with the standard signature — confirm arguments.
    results = process_map(test_generation, args, max_workers=cpu_num, chunksize=chunksize)
    return results
def evaluate(code_samples, apps_path):
    """Evaluate code samples against APPS, retrying to stabilize flaky results.

    There are some strange bugs in apps evaluation that cannot be reproduced:
    the same code can yield different eval results across runs. To compensate,
    every sample is evaluated 3 times and the per-run results are aggregated.

    code_samples: list of sample dicts with "task_id" and "solution_id".
    apps_path: path/name passed to datasets.load_dataset to load APPS.
    Returns code_samples with a "pass" bool written into each sample (mutated
    in place and returned).
    """
    apps = load_dataset(apps_path)

    # Run the full evaluation 3 times; all_results[i] is run i's results,
    # aligned with code_samples by position.
    all_results = []
    for _ in range(3):
        results = evaluate_code_samples(code_samples, apps)
        all_results.append(results)

    final_results = []
    # Transpose: lst collects the 3 runs' results for one sample.
    for lst in map(list, zip(*all_results)):
        assert len(set(x["task_id"] for x in lst)) == 1, "Mismatched task_id"
        assert len(set(x["solution_id"] for x in lst)) == 1, "Mismatched solution_id"
        task_id, solution_id = lst[0]["task_id"], lst[0]["solution_id"]
        if all(x["compilerr"] for x in lst):
            # Compile error in every run: genuinely failing code.
            is_pass = False
        else:
            # If there is a compilation error in only some of the runs,
            # treat those runs as framework flakes and drop them.
            lst = [x for x in lst if not x["compilerr"]]
            is_pass = all(x["pass"] for x in lst)
        final_results.append({
            "task_id": task_id,
            "solution_id": solution_id,
            "pass": is_pass
        })

    # Write the aggregated verdict back onto the input samples.
    for sample, is_pass in zip(code_samples, final_results):
        assert sample["task_id"] == is_pass["task_id"], "Mismatched task_id"
        assert sample["solution_id"] == is_pass["solution_id"], "Mismatched solution_id"
        sample["pass"] = is_pass["pass"]
    return code_samples
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment