Commit 94ea5cb1 by nzy

new evaluation logic

parent b1db6cb1
......@@ -6,9 +6,9 @@ import multiprocessing
import numpy as np
from tqdm.contrib.concurrent import process_map
from datasets import load_dataset
from codecritic.evaluation.apps_exec import run_test
from codecritic.utils.json import save_jsonl
from codecritic.dataset.code import extract_code
TIMEOUT = 10
......@@ -41,7 +41,7 @@ def check_correctness(sample, generation, timeout, debug=False):
def test_generation(args, debug=False):
    """Run one (APPS problem, sample) pair and summarize the test outcome.

    args: tuple of (apps_item, sample); the sample carries its cleaned
    solution code under "meta_clean_code".
    Returns the sample dict merged with:
      "task_id"/"solution_id" — copied through for later result matching,
      "pass"      — True iff every testcase result is positive,
      "timeout"   — True iff any testcase hit the -1 timeout sentinel,
      "compilerr" — True iff any testcase kept the -2 sentinel below.
    """
    apps_item, sample = args
    code = sample["meta_clean_code"]
    # -2 is the "never ran" sentinel: if execution fails before producing
    # results, the sample is reported as a compile error.
    curr_res = [-2]
    try:
        # NOTE(review): the original try body is hidden by the diff hunk;
        # reconstructed from the check_correctness helper visible in the
        # hunk header — confirm against the full file.
        curr_res = check_correctness(apps_item, code, timeout=TIMEOUT, debug=debug)
    except Exception as e:
        if debug:
            print(f"test framework exception = {repr(e)}")
    problem_result = np.asarray(curr_res)
    return {
        **sample,
        "task_id": sample["task_id"],
        "solution_id": sample["solution_id"],
        "pass": bool(np.all(problem_result > 0)),
        "timeout": bool(-1 in curr_res),
        "compilerr": bool(-2 in curr_res),
    }
def evaluate_code_samples(code_samples, apps):
    """Evaluate every code sample against its APPS problem in parallel.

    code_samples: list of dicts, each with a "task_id" of the form
    "<split>-<index>" (e.g. "test-123") addressing a row of `apps`.
    apps: the loaded APPS dataset (indexable as apps[split][index]).
    Returns one result dict per sample, in input order (see test_generation).
    """
    args = []
    for sample in code_samples:
        # task_id encodes both the dataset split and the row index.
        task_id = sample["task_id"]
        split, idx = task_id.split('-')
        args.append((apps[split][int(idx)], sample))
    cpu_num = multiprocessing.cpu_count()
    # ~5 chunks per worker keeps the pool busy without oversized batches.
    chunksize = max(len(code_samples) // (cpu_num * 5), 1)
    # NOTE(review): the original process_map call is hidden by the diff
    # hunk; reconstructed with the standard signature — confirm arguments.
    results = process_map(test_generation, args, max_workers=cpu_num, chunksize=chunksize)
    return results
def evaluate(code_samples, apps_path):
    """Evaluate code samples against APPS, retrying to stabilize flaky results.

    There are some strange bugs in apps evaluation that cannot be reproduced:
    the same code can yield different eval results across runs. To compensate,
    every sample is evaluated 3 times and the per-run results are aggregated.

    code_samples: list of sample dicts with "task_id" and "solution_id".
    apps_path: path/name passed to datasets.load_dataset to load APPS.
    Returns code_samples with a "pass" bool written into each sample (mutated
    in place and returned).
    """
    apps = load_dataset(apps_path)

    # Run the full evaluation 3 times; all_results[i] is run i's results,
    # aligned with code_samples by position.
    all_results = []
    for _ in range(3):
        results = evaluate_code_samples(code_samples, apps)
        all_results.append(results)

    final_results = []
    # Transpose: lst collects the 3 runs' results for one sample.
    for lst in map(list, zip(*all_results)):
        assert len(set(x["task_id"] for x in lst)) == 1, "Mismatched task_id"
        assert len(set(x["solution_id"] for x in lst)) == 1, "Mismatched solution_id"
        task_id, solution_id = lst[0]["task_id"], lst[0]["solution_id"]
        if all(x["compilerr"] for x in lst):
            # Compile error in every run: genuinely failing code.
            is_pass = False
        else:
            # If there is a compilation error in only some of the runs,
            # treat those runs as framework flakes and drop them.
            lst = [x for x in lst if not x["compilerr"]]
            is_pass = all(x["pass"] for x in lst)
        final_results.append({
            "task_id": task_id,
            "solution_id": solution_id,
            "pass": is_pass
        })

    # Write the aggregated verdict back onto the input samples.
    for sample, is_pass in zip(code_samples, final_results):
        assert sample["task_id"] == is_pass["task_id"], "Mismatched task_id"
        assert sample["solution_id"] == is_pass["solution_id"], "Mismatched solution_id"
        sample["pass"] = is_pass["pass"]
    return code_samples
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment