Commit d11a2acf by nzy

new apps sample

parent 94ea5cb1
......@@ -22,6 +22,7 @@
"pass": "boolean, indicates whether the solution passed the task",
"skip": "boolean, set to True if no solution passes this task",
"messages": "list of dictionaries, conversation messages in OpenAI format",
"code": "clean code",
"positive_score": "float, probability of the 'Yes' token",
"negative_score": "float, probability of the 'No' token",
"meta_***": "any additional data or custom fields",
......
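For illustration only, a record in this schema might look roughly like the sketch below; all values and the meta_difficulty key are made up, and the dataset/task_id/solution_id fields are the extra keys written by the sampling script further down, not part of the visible hunk:

# Hypothetical example record; values are illustrative, not taken from the dataset
example_sample = {
    "dataset": "apps-introductory",
    "task_id": "train-42",
    "solution_id": 0,
    "pass": True,
    "skip": False,
    "messages": [
        {"role": "user", "content": "Write a program that reads two integers and prints their sum."},
        {"role": "assistant", "content": "```python\na, b = map(int, input().split())\nprint(a + b)\n```"},
    ],
    "code": "a, b = map(int, input().split())\nprint(a + b)",
    "positive_score": 0.91,
    "negative_score": 0.09,
    "meta_difficulty": "introductory",
}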
import argparse
import json
from functools import partial
from collections import defaultdict
from datasets import load_dataset
from vllm import SamplingParams
from transformers import AutoTokenizer
from codecritic.dataset.apps import mk_prompt
from codecritic.dataset.code import extract_code
from codecritic.evaluation.apps_eval import evaluate
from codecritic.utils.inference import generate_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import save_jsonl


def transform_to_prompt(apps, tokenizer):
    prompts = []
    for split in ["train", "test"]:
        dataset = apps[split]
        for item in dataset:
            task_id = split + "-" + str(item["id"])

            # Skip problems whose test cases are not valid JSON
            try:
                json.loads(item["input_output"])
            except ValueError:
                print(f"Skipping {task_id}: Invalid JSON in input_output")
                continue

            prompt = mk_prompt(item)

            # Filter long prompts
            tokenized_question = tokenizer.apply_chat_template(prompt, tokenize=True)
            length = len(tokenized_question)
            if length > 2048:
                print(f"Skipping {task_id}: Token length {length} exceeds limit")
                continue

            prompts.append(
                {
                    "dataset": "apps-" + item["difficulty"],
                    "task_id": task_id,
                    "messages": prompt,
                }
            )
    return prompts


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--apps", type=str, help="path/to/apps")
    parser.add_argument("--train", type=str, help="path/to/train")
    parser.add_argument("--test", type=str, help="path/to/test")
    parser.add_argument(
        "--gpu", type=int, default=1, help="gpu number required by one model"
    )
    args = parser.parse_args()

    apps = load_dataset(args.apps)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    dataset = transform_to_prompt(apps, tokenizer)

    # sampling
    sampling_params = SamplingParams(
        n=50,
        temperature=0.8,
        top_p=0.95,
        max_tokens=2048,
    )
    worker = partial(
        generate_worker, model_path=args.model, sampling_params=sampling_params
    )
    dataset = model_map(worker, dataset, args.gpu)

    # postprocess: group samples by task, then split them into train/test sets
    grouped = defaultdict(list)
    for sample in dataset:
        grouped[sample["task_id"]].append(sample)

    def is_in_test(task_id):
        split, idx = task_id.split("-")
        idx = int(idx)
        if split == "test":
            for start, end in [(0, 300), (3000, 3100), (4000, 4100)]:
                if start <= idx < end:
                    return True
        return False

    trainset, testset = [], []
    for task_id, group in grouped.items():
        target = testset if is_in_test(task_id) else trainset
        for idx, sample in enumerate(group):
            sample["solution_id"] = idx
            sample["code"] = extract_code(sample["messages"][-1]["content"])
            target.append(sample)

    trainset = evaluate(trainset, apps)
    testset = evaluate(testset, apps)

    save_jsonl(trainset, args.train)
    save_jsonl(testset, args.test)
import argparse
from pathlib import Path

from codecritic.utils.json import load_json
from codecritic.dataset.utils import save_jsonl_dataset
from codecritic.dataset.edit_distance import (
    mk_problem_groups,
    calculate_edit_distances,
    mk_edit_distance_dataset,
)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    # type=bool would treat any non-empty string (including "False") as True;
    # BooleanOptionalAction (Python 3.9+) gives proper --is_max / --no-is_max flags
    parser.add_argument("--is_max", action=argparse.BooleanOptionalAction, required=True)
    args = parser.parse_args()

    dataset_dir = Path(args.dataset_dir)
    train_path = dataset_dir / "train.jsonl"
    sampling_params = load_json(dataset_dir / "sampling_params.json")

    problems = mk_problem_groups(train_path, sampling_params["n"])
    all_edit_distance_pairs = calculate_edit_distances(problems)

    postfix = "max" if args.is_max else "min"
    dataset_name = f"apps_edit_distance_{postfix}"

    preference_pairs, metadata = mk_edit_distance_dataset(
        all_edit_distance_pairs, 10 * 1000, 5, is_max=args.is_max
    )
    save_jsonl_dataset(preference_pairs, args.output_dir)
......@@ -24,7 +24,7 @@ if __name__ == "__main__":
help="maximum number of tokens allowed for the reasoning process.",
)
parser.add_argument(
"--gpu", type=int, default=1, help="gpu number required by model"
"--gpu", type=int, default=1, help="gpu number required by one model"
)
args = parser.parse_args()
......@@ -47,7 +47,7 @@ if __name__ == "__main__":
    worker = partial(
        generate_worker, model_path=args.model, sampling_params=sampling_params
    )
    dataset = model_map(worker, dataset, args.gpu_per_model)
    dataset = model_map(worker, dataset, args.gpu)

    def get_token_id(token):
        score_tokens = tokenizer.encode(token, add_special_tokens=False)
......
......@@ -6,8 +6,6 @@ import multiprocessing
import numpy as np
from tqdm.contrib.concurrent import process_map
from datasets import load_dataset
from codecritic.evaluation.apps_exec import run_test
TIMEOUT = 10
......@@ -41,7 +39,7 @@ def check_correctness(sample, generation, timeout, debug=False):
def test_generation(args, debug=False):
    apps_item, sample = args
    code = sample["meta_clean_code"]
    code = sample["code"]
    curr_res = [-2]
    try:
......@@ -92,7 +90,7 @@ def evaluate_code_samples(code_samples, apps):
    return results


def evaluate(code_samples, apps_path):
def evaluate(code_samples, apps):
    """
    There are some strange bugs in apps evaluation that cannot be reproduced.
    The observable issue is that the same code will yield different 'eval_result' values.
......@@ -102,7 +100,6 @@ def evaluate(code_samples, apps_path):
    Run twice to verify if the result is consistent.
    The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
    """
    apps = load_dataset(apps_path)
    all_results = []
    for _ in range(3):
        results = evaluate_code_samples(code_samples, apps)
......
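As a side note, the retry-until-consistent idea described in the docstring above can be sketched as follows; run_until_consistent is a hypothetical helper, not part of this commit, and only illustrates re-running evaluate_code_samples until two runs agree:

def run_until_consistent(code_samples, apps, loop_num=3):
    # Hypothetical sketch: re-run the flaky evaluation until two consecutive
    # runs return identical results, then accept that result.
    previous = None
    for _ in range(loop_num):
        current = evaluate_code_samples(code_samples, apps)
        if previous is not None and current == previous:
            return current
        previous = current
    # No agreement within loop_num runs; fall back to the last result.
    return previous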