Commit 8db3b29c by nzy

step1: evaluate code multiple times to obtain a consistent result

parent 6d8e17dc
@@ -17,12 +17,15 @@ def check_correctness(sample, generation, timeout, debug=False):
     """Check correctness of code generation with a global timeout.
     The global timeout is to catch some extreme/rare cases not handled by the timeouts
     inside `run_test`"""

     def _temp_run(sample, generation, debug, result):
         result.append(run_test(sample, test=generation, debug=debug))

     manager = multiprocessing.Manager()
     result = manager.list()
-    p = multiprocessing.Process(target=_temp_run, args=(sample, generation, debug, result))
+    p = multiprocessing.Process(
+        target=_temp_run, args=(sample, generation, debug, result)
+    )
     p.start()
     p.join(timeout=timeout + 1)
     if p.is_alive():
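For context: this hunk guards `run_test` with a hard deadline by running it in a child process and killing the child if it overruns. A minimal self-contained sketch of the same pattern (the hanging `_loop_forever` task is hypothetical, standing in for a stuck generation):

import multiprocessing

def _loop_forever(result):
    while True:  # simulates a generation that hangs past every inner timeout
        pass

if __name__ == "__main__":
    manager = multiprocessing.Manager()
    result = manager.list()  # shared list survives the child's death
    p = multiprocessing.Process(target=_loop_forever, args=(result,))
    p.start()
    p.join(timeout=2)  # global deadline, analogous to `timeout + 1` above
    if p.is_alive():
        p.kill()  # child ignored the deadline; treat it as a timeout
    print(list(result) if result else "global timeout")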
@@ -71,27 +74,66 @@ def test_generation(args, debug=False):
     return code_sample

-def evaluate_code_samples(code_samples: list, dataset_path: str):
-    apps_eval = load_dataset(dataset_path)
-
-    def get_apps_item(item):
-        problem_id = item["problem_id"]
-        split, idx = problem_id.split('_')
-        # get corresponding samples from APPS dataset
-        return apps_eval[split][int(idx)]
-
-    args = [(get_apps_item(sample), sample) for sample in code_samples]
+def get_apps_item(item, apps):
+    problem_id = item["problem_id"]
+    split, idx = problem_id.split("_")
+    # get the corresponding sample from the APPS dataset
+    return apps[split][int(idx)]
+
+
+def evaluate_code_samples(code_samples, apps):
+    args = [(get_apps_item(sample, apps), sample) for sample in code_samples]
     cpu_num = multiprocessing.cpu_count()
-    # TODO `chunksize` affects performance a lot
-    results = process_map(test_generation, args, max_workers=cpu_num, chunksize=1000)
+    # `chunksize` affects performance a lot: scale it with the input size,
+    # but keep it at least 1 so `process_map` accepts it.
+    chunksize = max(1, len(code_samples) // (cpu_num * 5))
+    results = process_map(
+        test_generation, args, max_workers=cpu_num, chunksize=chunksize
+    )
     return results
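`process_map` here is assumed to be `tqdm.contrib.concurrent.process_map`, which forwards `chunksize` to `ProcessPoolExecutor.map`: larger chunks amortize inter-process serialization overhead, smaller chunks balance load when per-item runtimes vary. A small sketch of the same heuristic on toy data:

import multiprocessing
from tqdm.contrib.concurrent import process_map

def square(x):
    return x * x

if __name__ == "__main__":
    items = list(range(10_000))
    cpu_num = multiprocessing.cpu_count()
    # roughly five chunks per worker, clamped so chunksize is never 0
    chunksize = max(1, len(items) // (cpu_num * 5))
    results = process_map(square, items, max_workers=cpu_num, chunksize=chunksize)
    assert results == [x * x for x in items]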
+def evaluate_incorrect_code_samples_again(results, apps, loop_num):
+    """
+    There are some strange bugs in the APPS evaluation that cannot be reproduced.
+    The observable symptom is that the same code can yield different 'eval_result'
+    values: the test framework sometimes raises an exception or decides,
+    unreasonably, that the code has timed out.
+    This function is an ugly workaround: whenever a result contains an error or
+    timeout marker (-1 or -2), the sample is evaluated again until two
+    consecutive runs agree. 'loop_num' bounds the number of retries.
+    """
+    maybe_incorrect_lst, correct_lst = [], []
+    for item in results:
+        if any(x in item["eval_result"] for x in (-1, -2)):
+            maybe_incorrect_lst.append(item)
+        else:
+            correct_lst.append(item)
+    print(f"maybe incorrect lst size: {len(maybe_incorrect_lst)}")
+
+    for _ in range(loop_num):
+        if len(maybe_incorrect_lst) == 0:
+            break
+        new_results = evaluate_code_samples(maybe_incorrect_lst, apps)
+        still_incorrect = []
+        for old_item, new_item in zip(maybe_incorrect_lst, new_results):
+            # two consecutive runs agree -> accept the result as consistent
+            if old_item["eval_result"] == new_item["eval_result"]:
+                correct_lst.append(new_item)
+            else:
+                still_incorrect.append(new_item)
+        # retry only the samples that are still inconsistent
+        maybe_incorrect_lst = still_incorrect
+
+    assert len(results) == len(correct_lst), "cannot consistently evaluate codes: " + str(
+        maybe_incorrect_lst
+    )
+    return correct_lst
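The retry logic above reduces to a small, general pattern: re-run a flaky check until two consecutive runs agree, with a bounded number of attempts. A standalone sketch (the `flaky_eval` stub is hypothetical, mimicking the non-reproducible timeouts):

import random

def flaky_eval(code):
    # ~30% of runs spuriously report a timeout marker, like the bug described above
    return [-1] if random.random() < 0.3 else [True, True]

def eval_until_consistent(code, loop_num=5):
    prev = flaky_eval(code)
    for _ in range(loop_num):
        cur = flaky_eval(code)
        if cur == prev:  # two consecutive runs agree
            return cur
        prev = cur
    raise RuntimeError(f"no consistent result after {loop_num} retries")

print(eval_until_consistent("print(42)"))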
 def evaluate(code_sample_path, dataset_path, output_path):
     code_samples = load_jsonl(code_sample_path)
-    results = evaluate_code_samples(code_samples, dataset_path)
+    apps = load_dataset(dataset_path)
+    results = evaluate_code_samples(code_samples, apps)
+    results = evaluate_incorrect_code_samples_again(results, apps, 5)
     save_jsonl(results, output_path)
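End to end, the pipeline reads generated solutions from JSONL, evaluates each against its APPS problem, retries the flaky ones, and writes the results back out. A hypothetical invocation, assuming the functions above are importable (field names inferred from the diff; the paths and dataset location are placeholders):

import json

# one record per generated solution; "problem_id" encodes "<split>_<index>"
sample = {"problem_id": "test_123", "code": "print(input())"}
with open("samples.jsonl", "w") as f:
    f.write(json.dumps(sample) + "\n")

evaluate("samples.jsonl", "/path/to/apps", "results.jsonl")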