Commit d11a2acf by nzy

new apps sample

parent 94ea5cb1
......@@ -22,6 +22,7 @@
"pass": "boolean, indicates whether the solution passed the task",
"skip": "boolean, set to True if no solution passes this task",
"messages": "list of dictionaries, conversation messages in OpenAI format",
"code": "clean code",
"positive_score": "float, probability of the 'Yes' token",
"negative_score": "float, probability of the 'No' token",
"meta_***": "any additional data or custom fields",
......
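For illustration only, a record in this schema might look roughly like the sketch below; all values and the meta_difficulty key are made up, and the dataset/task_id/solution_id fields are the extra keys written by the sampling script further down, not part of the visible hunk:

# Hypothetical example record; values are illustrative, not taken from the dataset
example_sample = {
    "dataset": "apps-introductory",
    "task_id": "train-42",
    "solution_id": 0,
    "pass": True,
    "skip": False,
    "messages": [
        {"role": "user", "content": "Write a program that reads two integers and prints their sum."},
        {"role": "assistant", "content": "```python\na, b = map(int, input().split())\nprint(a + b)\n```"},
    ],
    "code": "a, b = map(int, input().split())\nprint(a + b)",
    "positive_score": 0.91,
    "negative_score": 0.09,
    "meta_difficulty": "introductory",
}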
import argparse
import json
from functools import partial
from collections import defaultdict
from datasets import load_dataset
from vllm import SamplingParams
from transformers import AutoTokenizer
from codecritic.dataset.apps import mk_prompt
from codecritic.dataset.code import extract_code
from codecritic.evaluation.apps_eval import evaluate
from codecritic.utils.inference import generate_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import save_jsonl


def transform_to_prompt(apps, tokenizer):
    prompts = []
    for split in ["train", "test"]:
        dataset = apps[split]
        for item in dataset:
            task_id = split + "-" + str(item["id"])

            # Skip problems whose test cases are not valid JSON
            try:
                json.loads(item["input_output"])
            except ValueError:
                print(f"Skipping {task_id}: Invalid JSON in input_output")
                continue

            prompt = mk_prompt(item)

            # Filter long prompts
            tokenized_question = tokenizer.apply_chat_template(prompt, tokenize=True)
            length = len(tokenized_question)
            if length > 2048:
                print(f"Skipping {task_id}: Token length {length} exceeds limit")
                continue

            prompts.append(
                {
                    "dataset": "apps-" + item["difficulty"],
                    "task_id": task_id,
                    "messages": prompt,
                }
            )
    return prompts


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--apps", type=str, help="path/to/apps")
    parser.add_argument("--train", type=str, help="path/to/train")
    parser.add_argument("--test", type=str, help="path/to/test")
    parser.add_argument(
        "--gpu", type=int, default=1, help="gpu number required by one model"
    )
    args = parser.parse_args()

    apps = load_dataset(args.apps)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    dataset = transform_to_prompt(apps, tokenizer)

    # sampling
    sampling_params = SamplingParams(
        n=50,
        temperature=0.8,
        top_p=0.95,
        max_tokens=2048,
    )
    worker = partial(
        generate_worker, model_path=args.model, sampling_params=sampling_params
    )
    dataset = model_map(worker, dataset, args.gpu)

    # postprocess: group samples by task, then split them into train/test sets
    grouped = defaultdict(list)
    for sample in dataset:
        grouped[sample["task_id"]].append(sample)

    def is_in_test(task_id):
        split, idx = task_id.split("-")
        idx = int(idx)
        if split == "test":
            for start, end in [(0, 300), (3000, 3100), (4000, 4100)]:
                if start <= idx < end:
                    return True
        return False

    trainset, testset = [], []
    for task_id, group in grouped.items():
        target = testset if is_in_test(task_id) else trainset
        for idx, sample in enumerate(group):
            sample["solution_id"] = idx
            sample["code"] = extract_code(sample["messages"][-1]["content"])
            target.append(sample)

    trainset = evaluate(trainset, apps)
    testset = evaluate(testset, apps)

    save_jsonl(trainset, args.train)
    save_jsonl(testset, args.test)
import argparse
from pathlib import Path

from codecritic.utils.json import load_json
from codecritic.dataset.utils import save_jsonl_dataset
from codecritic.dataset.edit_distance import (
    mk_problem_groups,
    calculate_edit_distances,
    mk_edit_distance_dataset,
)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    # type=bool would treat any non-empty string (including "False") as True;
    # BooleanOptionalAction (Python 3.9+) gives proper --is_max / --no-is_max flags
    parser.add_argument("--is_max", action=argparse.BooleanOptionalAction, required=True)
    args = parser.parse_args()

    dataset_dir = Path(args.dataset_dir)
    train_path = dataset_dir / "train.jsonl"
    sampling_params = load_json(dataset_dir / "sampling_params.json")

    problems = mk_problem_groups(train_path, sampling_params["n"])
    all_edit_distance_pairs = calculate_edit_distances(problems)

    postfix = "max" if args.is_max else "min"
    dataset_name = f"apps_edit_distance_{postfix}"

    preference_pairs, metadata = mk_edit_distance_dataset(
        all_edit_distance_pairs, 10 * 1000, 5, is_max=args.is_max
    )
    save_jsonl_dataset(preference_pairs, args.output_dir)
......@@ -24,7 +24,7 @@ if __name__ == "__main__":
help="maximum number of tokens allowed for the reasoning process.",
)
parser.add_argument(
"--gpu", type=int, default=1, help="gpu number required by model"
"--gpu", type=int, default=1, help="gpu number required by one model"
)
args = parser.parse_args()
......@@ -47,7 +47,7 @@ if __name__ == "__main__":
    worker = partial(
        generate_worker, model_path=args.model, sampling_params=sampling_params
    )
    dataset = model_map(worker, dataset, args.gpu_per_model)
    dataset = model_map(worker, dataset, args.gpu)

    def get_token_id(token):
        score_tokens = tokenizer.encode(token, add_special_tokens=False)
......
......@@ -6,8 +6,6 @@ import multiprocessing
import numpy as np
from tqdm.contrib.concurrent import process_map
from datasets import load_dataset
from codecritic.evaluation.apps_exec import run_test
TIMEOUT = 10
......@@ -41,7 +39,7 @@ def check_correctness(sample, generation, timeout, debug=False):
def test_generation(args, debug=False):
    apps_item, sample = args
    code = sample["meta_clean_code"]
    code = sample["code"]
    curr_res = [-2]
    try:
......@@ -92,7 +90,7 @@ def evaluate_code_samples(code_samples, apps):
    return results


def evaluate(code_samples, apps_path):
def evaluate(code_samples, apps):
    """
    There are some strange bugs in apps evaluation that cannot be reproduced.
    The observable issue is that the same code will yield different 'eval_result' values.
......@@ -102,7 +100,6 @@ def evaluate(code_samples, apps_path):
    Run twice to verify if the result is consistent.
    The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
    """
    apps = load_dataset(apps_path)
    all_results = []
    for _ in range(3):
        results = evaluate_code_samples(code_samples, apps)
......
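As a side note, the retry-until-consistent idea described in the docstring above can be sketched as follows; run_until_consistent is a hypothetical helper, not part of this commit, and only illustrates re-running evaluate_code_samples until two runs agree:

def run_until_consistent(code_samples, apps, loop_num=3):
    # Hypothetical sketch: re-run the flaky evaluation until two consecutive
    # runs return identical results, then accept that result.
    previous = None
    for _ in range(loop_num):
        current = evaluate_code_samples(code_samples, apps)
        if previous is not None and current == previous:
            return current
        previous = current
    # No agreement within loop_num runs; fall back to the last result.
    return previous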