Commit 91cf4380 by nzy

improve eval_apps

parent b319f162
```diff
 # CodeCritic
+
+## Installation
+
+```
+pip install scikit-learn
+pip install
+```
+
 ## Evaluation
 
 ### APPS-500 (Top@k)
```
......
```diff
@@ -1,4 +1,5 @@
 import argparse
+from pathlib import Path
 import os
 import json
 from functools import partial
@@ -14,7 +15,7 @@ from codecritic.dataset.code import extract_code
 from codecritic.evaluation.apps_eval import evaluate
 from codecritic.utils.inference import generate_worker
 from codecritic.utils.parallel import model_map
-from codecritic.utils.json import save_jsonl
+from codecritic.utils.json import load_jsonl, save_jsonl
 
 
 def transform_to_prompt(apps, tokenizer):
@@ -56,49 +57,60 @@ if __name__ == "__main__":
     parser.add_argument("--train", type=str, help="path/to/train")
     parser.add_argument("--test", type=str, help="path/to/test")
     parser.add_argument(
-        "--gpu", type=int, default=1, help="gpu number required by one model"
+        "--tp", type=int, default=1, help="tensor parallel"
     )
     args = parser.parse_args()
 
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
     apps = load_dataset(args.apps)
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
-    dataset = transform_to_prompt(apps, tokenizer)
-
-    # sampling
-    sampling_params = SamplingParams(
-        n=50,
-        temperature=0.8,
-        top_p=0.95,
-        max_tokens=2048,
-    )
-
-    worker = partial(
-        generate_worker, model_path=args.model, sampling_params=sampling_params
-    )
-    dataset = model_map(worker, dataset, args.gpu)
-
-    # postprocess
-    grouped = defaultdict(list)
-    for sample in dataset:
-        grouped[sample["task_id"]].append(sample)
-
-    def is_in_test(task_id):
-        split, idx = task_id.split("-")
-        idx = int(idx)
-        if split == "test":
-            for start, end in [(0, 300), (3000, 3100), (4000, 4100)]:
-                if start <= idx < end:
-                    return True
-        return False
-
-    trainset, testset = [], []
-    for task_id, group in grouped.items():
-        target = testset if is_in_test(task_id) else trainset
-        for idx, sample in enumerate(group):
-            sample["solution_id"] = idx
-            sample["code"] = extract_code(sample["messages"][-1]["content"])
-            target.append(sample)
+    train_raw_path = Path(args.train + ".raw")
+    test_raw_path = Path(args.test + ".raw")
+
+    if not (train_raw_path.exists() and test_raw_path.exists()):
+        tokenizer = AutoTokenizer.from_pretrained(args.model)
+        dataset = transform_to_prompt(apps, tokenizer)
+
+        # sampling
+        sampling_params = SamplingParams(
+            n=50,
+            temperature=0.8,
+            top_p=0.95,
+            max_tokens=2048,
+        )
+
+        worker = partial(
+            generate_worker, model_path=args.model, sampling_params=sampling_params
+        )
+        dataset = model_map(worker, dataset, args.tp)
+
+        # postprocess
+        grouped = defaultdict(list)
+        for sample in dataset:
+            grouped[sample["task_id"]].append(sample)
+
+        def is_in_test(task_id):
+            split, idx = task_id.split("-")
+            idx = int(idx)
+            if split == "test":
+                for start, end in [(0, 300), (3000, 3100), (4000, 4100)]:
+                    if start <= idx < end:
+                        return True
+            return False
+
+        trainset, testset = [], []
+        for task_id, group in grouped.items():
+            target = testset if is_in_test(task_id) else trainset
+            for idx, sample in enumerate(group):
+                sample["solution_id"] = idx
+                sample["code"] = extract_code(sample["messages"][-1]["content"])
+                target.append(sample)
+
+        save_jsonl(trainset, train_raw_path)
+        save_jsonl(testset, test_raw_path)
+    else:
+        trainset = load_jsonl(train_raw_path)
+        testset = load_jsonl(test_raw_path)
 
     trainset = evaluate(trainset, apps)
     testset = evaluate(testset, apps)
```
......
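The change above wraps the expensive sampling pass (50 completions per problem) in a cache check, so a rerun reloads `<train>.raw` / `<test>.raw` instead of regenerating everything; note also that the three index ranges in `is_in_test` keep 300 + 100 + 100 = 500 test problems, matching the APPS-500 subset named in the README. Below is a minimal sketch of the same save-or-reload pattern, assuming the repo's JSONL helpers write one JSON object per line; `expensive_sampling` is a hypothetical stand-in for the sampling pipeline:

```python
import json
from pathlib import Path


def save_jsonl(rows, path):
    # one JSON object per line, as the repo's save_jsonl presumably does
    with open(path, "w") as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")


def load_jsonl(path):
    with open(path) as f:
        return [json.loads(line) for line in f]


def sample_or_load(out_path, expensive_sampling):
    # reuse a previous run's raw output when it exists; otherwise run
    # the expensive pass once and persist it for the next invocation
    raw_path = Path(str(out_path) + ".raw")
    if raw_path.exists():
        return load_jsonl(raw_path)
    rows = expensive_sampling()
    save_jsonl(rows, raw_path)
    return rows
```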
```diff
@@ -24,7 +24,7 @@ if __name__ == "__main__":
         help="maximum number of tokens allowed for the reasoning process.",
     )
     parser.add_argument(
-        "--gpu", type=int, default=1, help="gpu number required by one model"
+        "--tp", type=int, default=1, help="tensor parallel"
     )
     args = parser.parse_args()
@@ -47,7 +47,7 @@ if __name__ == "__main__":
     worker = partial(
         generate_worker, model_path=args.model, sampling_params=sampling_params
     )
-    dataset = model_map(worker, dataset, args.gpu)
+    dataset = model_map(worker, dataset, args.tp)
 
 
     def get_token_id(token):
         score_tokens = tokenizer.encode(token, add_special_tokens=False)
```
......
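Both entry points rename `--gpu` to `--tp`, matching vLLM's tensor-parallel vocabulary: the flag shards one model instance across `tp` GPUs rather than counting GPUs generically. `generate_worker` and `model_map` live elsewhere in the repo and are not shown in this diff; a hedged sketch of what such a worker could look like with vLLM's public API:

```python
from vllm import LLM, SamplingParams


def generate_worker(prompts, model_path, sampling_params, tp=1):
    # tensor_parallel_size splits the model's weights across `tp` GPUs;
    # the repo's actual generate_worker may differ
    llm = LLM(model=model_path, tensor_parallel_size=tp)
    outputs = llm.generate(prompts, sampling_params)
    return [[c.text for c in out.outputs] for out in outputs]


# usage with the sampling settings from the diff (prompts are hypothetical):
# params = SamplingParams(n=50, temperature=0.8, top_p=0.95, max_tokens=2048)
# completions = generate_worker(["def add(a, b):"], "path/to/model", params, tp=2)
```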
```diff
@@ -82,7 +82,8 @@ def evaluate_code_samples(code_samples, apps):
         args.append((apps[split][int(idx)], sample))
 
     cpu_num = multiprocessing.cpu_count()
-    chunksize = max(len(code_samples) // (cpu_num * 5), 1)
+    # chunksize = max(len(code_samples) // (cpu_num * 5), 1)
+    chunksize = 10000
     # TODO performance?
     results = process_map(
         test_generation, args, max_workers=cpu_num, chunksize=chunksize
@@ -101,7 +102,7 @@ def evaluate(code_samples, apps):
     The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
     """
     all_results = []
-    for _ in range(3):
+    for _ in range(2):
         results = evaluate_code_samples(code_samples, apps)
         all_results.append(results)
```
......
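Two evaluator tweaks here: `chunksize` for `process_map` is pinned at 10000 instead of scaling with input size (larger chunks amortize inter-process dispatch overhead when individual tests are cheap, per the `# TODO performance?` note), and the consistency loop drops from three evaluation runs to two. The diff doesn't show how the repeated runs are merged; one plausible reconciliation, assuming each run yields a boolean verdict per sample, is to require agreement across runs:

```python
def merge_runs(all_results):
    # treat a sample as passing only if every run agrees, which guards
    # against flaky failures (timeouts, nondeterministic tests)
    return [all(verdicts) for verdicts in zip(*all_results)]


# e.g. two runs over three samples:
# merge_runs([[True, False, True], [True, True, True]]) -> [True, False, True]
```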