Commit 8db3b29c by nzy

step1: evaluate code multiple times to obtain a consistent result

parent 6d8e17dc
@@ -17,12 +17,15 @@ def check_correctness(sample, generation, timeout, debug=False):
     """Check correctness of code generation with a global timeout.
     The global timeout is to catch some extreme/rare cases not handled by the timeouts
     inside `run_test`"""

     def _temp_run(sample, generation, debug, result):
         result.append(run_test(sample, test=generation, debug=debug))

     manager = multiprocessing.Manager()
     result = manager.list()
-    p = multiprocessing.Process(target=_temp_run, args=(sample, generation, debug, result))
+    p = multiprocessing.Process(
+        target=_temp_run, args=(sample, generation, debug, result)
+    )
     p.start()
     p.join(timeout=timeout + 1)
     if p.is_alive():
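For context: this hunk guards `run_test` with a hard deadline by running it in a child process and killing the child if it overruns. A minimal self-contained sketch of the same pattern (the hanging `_loop_forever` task is hypothetical, standing in for a stuck generation):

import multiprocessing

def _loop_forever(result):
    while True:  # simulates a generation that hangs past every inner timeout
        pass

if __name__ == "__main__":
    manager = multiprocessing.Manager()
    result = manager.list()  # shared list survives the child's death
    p = multiprocessing.Process(target=_loop_forever, args=(result,))
    p.start()
    p.join(timeout=2)  # global deadline, analogous to `timeout + 1` above
    if p.is_alive():
        p.kill()  # child ignored the deadline; treat it as a timeout
    print(list(result) if result else "global timeout")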
@@ -71,27 +74,66 @@ def test_generation(args, debug=False):
     return code_sample

-def evaluate_code_samples(code_samples: list, dataset_path: str):
-    apps_eval = load_dataset(dataset_path)
-
-    def get_apps_item(item):
-        problem_id = item["problem_id"]
-        split, idx = problem_id.split('_')
-        # get corresponding samples from APPS dataset
-        return apps_eval[split][int(idx)]
-
-    args = [(get_apps_item(sample), sample) for sample in code_samples]
+def get_apps_item(item, apps):
+    problem_id = item["problem_id"]
+    split, idx = problem_id.split("_")
+    # get the corresponding sample from the APPS dataset
+    return apps[split][int(idx)]
+
+
+def evaluate_code_samples(code_samples, apps):
+    args = [(get_apps_item(sample, apps), sample) for sample in code_samples]
     cpu_num = multiprocessing.cpu_count()
-    # TODO `chunksize` affects performance a lot
-    results = process_map(test_generation, args, max_workers=cpu_num, chunksize=1000)
+    # `chunksize` affects performance a lot: scale it with the input size,
+    # but keep it at least 1 so `process_map` accepts it.
+    chunksize = max(1, len(code_samples) // (cpu_num * 5))
+    results = process_map(
+        test_generation, args, max_workers=cpu_num, chunksize=chunksize
+    )
     return results
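`process_map` here is assumed to be `tqdm.contrib.concurrent.process_map`, which forwards `chunksize` to `ProcessPoolExecutor.map`: larger chunks amortize inter-process serialization overhead, smaller chunks balance load when per-item runtimes vary. A small sketch of the same heuristic on toy data:

import multiprocessing
from tqdm.contrib.concurrent import process_map

def square(x):
    return x * x

if __name__ == "__main__":
    items = list(range(10_000))
    cpu_num = multiprocessing.cpu_count()
    # roughly five chunks per worker, clamped so chunksize is never 0
    chunksize = max(1, len(items) // (cpu_num * 5))
    results = process_map(square, items, max_workers=cpu_num, chunksize=chunksize)
    assert results == [x * x for x in items]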
+def evaluate_incorrect_code_samples_again(results, apps, loop_num):
+    """
+    There are some strange bugs in the APPS evaluation that cannot be reproduced.
+    The observable symptom is that the same code can yield different 'eval_result'
+    values: the test framework sometimes raises an exception or decides,
+    unreasonably, that the code has timed out.
+    This function is an ugly workaround: whenever a result contains an error or
+    timeout marker (-1 or -2), the sample is evaluated again until two
+    consecutive runs agree. 'loop_num' bounds the number of retries.
+    """
+    maybe_incorrect_lst, correct_lst = [], []
+    for item in results:
+        if any(x in item["eval_result"] for x in (-1, -2)):
+            maybe_incorrect_lst.append(item)
+        else:
+            correct_lst.append(item)
+    print(f"maybe incorrect lst size: {len(maybe_incorrect_lst)}")
+
+    for _ in range(loop_num):
+        if len(maybe_incorrect_lst) == 0:
+            break
+        new_results = evaluate_code_samples(maybe_incorrect_lst, apps)
+        still_incorrect = []
+        for old_item, new_item in zip(maybe_incorrect_lst, new_results):
+            # two consecutive runs agree -> accept the result as consistent
+            if old_item["eval_result"] == new_item["eval_result"]:
+                correct_lst.append(new_item)
+            else:
+                still_incorrect.append(new_item)
+        # retry only the samples that are still inconsistent
+        maybe_incorrect_lst = still_incorrect
+
+    assert len(results) == len(correct_lst), "cannot consistently evaluate codes: " + str(
+        maybe_incorrect_lst
+    )
+    return correct_lst
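The retry logic above reduces to a small, general pattern: re-run a flaky check until two consecutive runs agree, with a bounded number of attempts. A standalone sketch (the `flaky_eval` stub is hypothetical, mimicking the non-reproducible timeouts):

import random

def flaky_eval(code):
    # ~30% of runs spuriously report a timeout marker, like the bug described above
    return [-1] if random.random() < 0.3 else [True, True]

def eval_until_consistent(code, loop_num=5):
    prev = flaky_eval(code)
    for _ in range(loop_num):
        cur = flaky_eval(code)
        if cur == prev:  # two consecutive runs agree
            return cur
        prev = cur
    raise RuntimeError(f"no consistent result after {loop_num} retries")

print(eval_until_consistent("print(42)"))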
 def evaluate(code_sample_path, dataset_path, output_path):
     code_samples = load_jsonl(code_sample_path)
-    results = evaluate_code_samples(code_samples, dataset_path)
+    apps = load_dataset(dataset_path)
+    results = evaluate_code_samples(code_samples, apps)
+    results = evaluate_incorrect_code_samples_again(results, apps, 5)
     save_jsonl(results, output_path)
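End to end, the pipeline reads generated solutions from JSONL, evaluates each against its APPS problem, retries the flaky ones, and writes the results back out. A hypothetical invocation, assuming the functions above are importable (field names inferred from the diff; the paths and dataset location are placeholders):

import json

# one record per generated solution; "problem_id" encodes "<split>_<index>"
sample = {"problem_id": "test_123", "code": "print(input())"}
with open("samples.jsonl", "w") as f:
    f.write(json.dumps(sample) + "\n")

evaluate("samples.jsonl", "/path/to/apps", "results.jsonl")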