Commit 94ea5cb1 by nzy

new evaluation logic

parent b1db6cb1
@@ -6,9 +6,9 @@ import multiprocessing
 import numpy as np
 from tqdm.contrib.concurrent import process_map
+from datasets import load_dataset

 from codecritic.evaluation.apps_exec import run_test
-from codecritic.utils.json import save_jsonl
-from codecritic.dataset.code import extract_code

 TIMEOUT = 10
@@ -41,7 +41,7 @@ def check_correctness(sample, generation, timeout, debug=False):
 def test_generation(args, debug=False):
     apps_item, sample = args
-    code = extract_code(sample["response"][0]["content"])
+    code = sample["meta_clean_code"]
     curr_res = [-2]
     try:
@@ -68,18 +68,20 @@ def test_generation(args, debug=False):
     problem_result = np.asarray(curr_res)
     return {
-        **sample,
-        "code": code,
-        "eval_result": bool(np.all(problem_result > 0)),
-        "testcase": curr_res
+        "task_id": sample["task_id"],
+        "solution_id": sample["solution_id"],
+        "pass": bool(np.all(problem_result > 0)),
+        "timeout": bool(-1 in curr_res),
+        "compilerr": bool(-2 in curr_res),
     }


 def evaluate_code_samples(code_samples, apps):
     args = []
     for sample in code_samples:
-        problem_id = sample["problem_id"]
-        args.append((apps["test"][int(problem_id)], sample))
+        task_id = sample["task_id"]
+        split, idx = task_id.split('-')
+        args.append((apps[split][int(idx)], sample))

     cpu_num = multiprocessing.cpu_count()
     chunksize = max(len(code_samples) // (cpu_num * 5), 1)
@@ -90,7 +92,7 @@ def evaluate_code_samples(code_samples, apps):
     return results


-def evaluate_incorrect_code_samples_again(results, apps, loop_num):
+def evaluate(code_samples, apps_path):
     """
     There are some strange bugs in apps evaluation that cannot be reproduced.
     The observable issue is that the same code will yield different 'eval_result' values.
@@ -100,42 +102,35 @@ def evaluate_incorrect_code_samples_again(results, apps, loop_num):
     Run twice to verify if the result is consistent.
     The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
     """
-    maybe_incorrect_lst, correct_lst = [], []
-    for item in results:
-        if any(x in item["testcase"] for x in (-1, -2)):
-            maybe_incorrect_lst.append(item)
-        else:
-            correct_lst.append(item)
-
-    for _ in range(loop_num):
-        if len(maybe_incorrect_lst) == 0:
-            break
-
-        new_results = evaluate_code_samples(maybe_incorrect_lst, apps)
-        print(f"maybe incorrect lst size: {len(maybe_incorrect_lst)}")
-        check_lst = []
-        for i in range(len(new_results)):
-            old_item, new_item = maybe_incorrect_lst[i], new_results[i]
-            old_eval, new_eval = old_item["eval_result"], new_item["eval_result"]
-            if old_eval == new_eval:
-                correct_lst.append(old_item)
-            else:
-                check_lst.append(new_item)
-                print(old_item["problem_id"], old_eval, new_item["problem_id"], new_eval)
-        maybe_incorrect_lst = check_lst
-
-    if len(results) != len(correct_lst):
-        save_jsonl(maybe_incorrect_lst, "debug.jsonl")
-        # raise ValueError("cannot correctly evaluate codes")
-        print("cannot correctly evalute code. see debug.jsonl")
-        if len(maybe_incorrect_lst) < 5:
-            correct_lst.extend(maybe_incorrect_lst)
-
-    return correct_lst
-
-
-def evaluate(code_samples, apps):
-    results = evaluate_code_samples(code_samples, apps)
-    results = evaluate_incorrect_code_samples_again(results, apps, 10)
-    return results
+    apps = load_dataset(apps_path)
+    all_results = []
+    for _ in range(3):
+        results = evaluate_code_samples(code_samples, apps)
+        all_results.append(results)
+
+    final_results = []
+    for lst in map(list, zip(*all_results)):
+        assert len(set(x["task_id"] for x in lst)) == 1, "Mismatched task_id"
+        assert len(set(x["solution_id"] for x in lst)) == 1, "Mismatched solution_id"
+        task_id, solution_id = lst[0]["task_id"], lst[0]["solution_id"]
+
+        if all(x["compilerr"] for x in lst):
+            is_pass = False
+        else:
+            # If there is a compilation error in any of the multiple runs, treat it as an exception and remove it.
+            lst = [x for x in lst if not x["compilerr"]]
+            is_pass = all(x["pass"] for x in lst)
+
+        final_results.append({
+            "task_id": task_id,
+            "solution_id": solution_id,
+            "pass": is_pass
+        })
+
+    for sample, is_pass in zip(code_samples, final_results):
+        assert sample["task_id"] == is_pass["task_id"], "Mismatched task_id"
+        assert sample["solution_id"] == is_pass["solution_id"], "Mismatched solution_id"
+        sample["pass"] = is_pass["pass"]
+
+    return code_samples