Commit d11a2acf by nzy

new apps sample

parent 94ea5cb1
@@ -22,6 +22,7 @@
     "pass": "boolean, indicates whether the solution passed the task",
     "skip": "boolean, set to True if no solution passes this task",
     "messages": "list of dictionaries, conversation messages in OpenAI format",
+    "code": "clean code",
     "positive_score": "float, probability of the 'Yes' token",
     "negative_score": "float, probability of the 'No' token",
     "meta_***": "any additional data or custom fields",
......
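For reference, a record conforming to the schema above might look like the sketch below. The values are purely illustrative (not taken from the dataset), and meta_difficulty stands in for any meta_*** custom field.

# Illustrative only: one record with the fields described in the schema above.
sample_record = {
    "dataset": "apps-interview",      # APPS difficulty tag
    "task_id": "train-42",            # "<split>-<problem id>"
    "solution_id": 3,                 # index of the sampled solution for this task
    "messages": [
        {"role": "user", "content": "Write a program that ..."},
        {"role": "assistant", "content": "Here is my solution:\n\nprint(sum(map(int, input().split())))"},
    ],
    "code": "print(sum(map(int, input().split())))",  # clean code extracted from the last message
    "pass": True,                     # solution passed the task's tests
    "skip": False,                    # at least one solution passes this task
    "positive_score": 0.87,           # probability of the 'Yes' token
    "negative_score": 0.13,           # probability of the 'No' token
    "meta_difficulty": "interview",   # example of a meta_*** custom field
}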
import argparse
import json
from collections import defaultdict
from functools import partial

from datasets import load_dataset
from transformers import AutoTokenizer
from vllm import SamplingParams

from codecritic.dataset.apps import mk_prompt
from codecritic.dataset.code import extract_code
from codecritic.evaluation.apps_eval import evaluate
from codecritic.utils.inference import generate_worker
from codecritic.utils.json import save_jsonl
from codecritic.utils.parallel import model_map


def transform_to_prompt(apps, tokenizer):
    prompts = []
    for split in ["train", "test"]:
        dataset = apps[split]
        for item in dataset:
            task_id = split + "-" + str(item["id"])

            # Skip tasks whose test cases are not valid JSON.
            try:
                json.loads(item["input_output"])
            except ValueError:
                print(f"Skipping {task_id}: Invalid JSON in input_output")
                continue

            prompt = mk_prompt(item)

            # Filter out prompts that exceed the context budget.
            tokenized_question = tokenizer.apply_chat_template(prompt, tokenize=True)
            length = len(tokenized_question)
            if length > 2048:
                print(f"Skipping {task_id}: Token length {length} exceeds limit")
                continue

            prompts.append(
                {
                    "dataset": "apps-" + item["difficulty"],
                    "task_id": task_id,
                    "messages": prompt,
                }
            )
    return prompts
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--apps", type=str, help="path/to/apps")
    parser.add_argument("--train", type=str, help="path/to/train")
    parser.add_argument("--test", type=str, help="path/to/test")
    parser.add_argument(
        "--gpu", type=int, default=1, help="gpu number required by one model"
    )
    args = parser.parse_args()

    apps = load_dataset(args.apps)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    dataset = transform_to_prompt(apps, tokenizer)

    # sampling: draw 50 candidate solutions per task
    sampling_params = SamplingParams(
        n=50,
        temperature=0.8,
        top_p=0.95,
        max_tokens=2048,
    )
    worker = partial(
        generate_worker, model_path=args.model, sampling_params=sampling_params
    )
    dataset = model_map(worker, dataset, args.gpu)

    # postprocess: group samples by task
    grouped = defaultdict(list)
    for sample in dataset:
        grouped[sample["task_id"]].append(sample)

    def is_in_test(task_id):
        split, idx = task_id.split("-")
        if split == "test":
            idx = int(idx)
            for start, end in [(0, 300), (3000, 3100), (4000, 4100)]:
                if start <= idx < end:
                    return True
        return False

    trainset, testset = [], []
    for task_id, group in grouped.items():
        target = testset if is_in_test(task_id) else trainset
        for idx, sample in enumerate(group):
            sample["solution_id"] = idx
            sample["code"] = extract_code(sample["messages"][-1]["content"])
            target.append(sample)

    trainset = evaluate(trainset, apps)
    testset = evaluate(testset, apps)

    save_jsonl(trainset, args.train)
    save_jsonl(testset, args.test)
import argparse
from pathlib import Path

from codecritic.dataset.edit_distance import (
    mk_problem_groups,
    calculate_edit_distances,
    mk_edit_distance_dataset,
)
from codecritic.dataset.utils import save_jsonl_dataset
from codecritic.utils.json import load_json


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument(
        "--is_max",
        action="store_true",
        help="select pairs by maximum edit distance instead of minimum",
    )
    args = parser.parse_args()

    dataset_dir = Path(args.dataset_dir)
    train_path = dataset_dir / "train.jsonl"
    sampling_params = load_json(dataset_dir / "sampling_params.json")

    problems = mk_problem_groups(train_path, sampling_params["n"])
    all_edit_distance_pairs = calculate_edit_distances(problems)

    postfix = "max" if args.is_max else "min"
    dataset_name = f"apps_edit_distance_{postfix}"

    preference_pairs, metadata = mk_edit_distance_dataset(
        all_edit_distance_pairs, 10 * 1000, 5, is_max=args.is_max
    )
    save_jsonl_dataset(preference_pairs, args.output_dir)
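The helpers mk_problem_groups, calculate_edit_distances, and mk_edit_distance_dataset live in codecritic.dataset.edit_distance and are not shown in this commit. As a rough illustration of the idea only (not the repository's implementation), picking a chosen/rejected pair by minimum or maximum edit distance could look like the toy sketch below, where passing and failing are hypothetical lists of sample dicts carrying a "code" field.

# Toy sketch: select a preference pair by edit distance between a passing
# and a failing solution for one problem. Illustrative only.
import itertools


def levenshtein(a: str, b: str) -> int:
    """Classic dynamic-programming edit distance between two strings."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1,                 # deletion
                            curr[j - 1] + 1,             # insertion
                            prev[j - 1] + (ca != cb)))   # substitution
        prev = curr
    return prev[-1]


def pick_pair(passing, failing, is_max=False):
    """Return the (chosen, rejected, distance) pair with min (or max) edit distance."""
    scored = [
        (levenshtein(p["code"], f["code"]), p, f)
        for p, f in itertools.product(passing, failing)
    ]
    selector = max if is_max else min
    dist, chosen, rejected = selector(scored, key=lambda t: t[0])
    return chosen, rejected, dist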
@@ -24,7 +24,7 @@ if __name__ == "__main__":
         help="maximum number of tokens allowed for the reasoning process.",
     )
     parser.add_argument(
-        "--gpu", type=int, default=1, help="gpu number required by model"
+        "--gpu", type=int, default=1, help="gpu number required by one model"
     )
     args = parser.parse_args()
@@ -47,7 +47,7 @@ if __name__ == "__main__":
     worker = partial(
         generate_worker, model_path=args.model, sampling_params=sampling_params
     )
-    dataset = model_map(worker, dataset, args.gpu_per_model)
+    dataset = model_map(worker, dataset, args.gpu)

     def get_token_id(token):
         score_tokens = tokenizer.encode(token, add_special_tokens=False)
......
@@ -6,8 +6,6 @@ import multiprocessing
 import numpy as np
 from tqdm.contrib.concurrent import process_map

-from datasets import load_dataset
-
 from codecritic.evaluation.apps_exec import run_test

 TIMEOUT = 10
@@ -41,7 +39,7 @@ def check_correctness(sample, generation, timeout, debug=False):
 def test_generation(args, debug=False):
     apps_item, sample = args
-    code = sample["meta_clean_code"]
+    code = sample["code"]
     curr_res = [-2]
     try:
@@ -92,7 +90,7 @@ def evaluate_code_samples(code_samples, apps):
     return results

-def evaluate(code_samples, apps_path):
+def evaluate(code_samples, apps):
     """
     There are some strange bugs in apps evaluation that cannot be reproduced.
     The observable issue is that the same code will yield different 'eval_result' values.
@@ -102,7 +100,6 @@ def evaluate(code_samples, apps_path):
     Run twice to verify if the result is consistent.
     The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
     """
-    apps = load_dataset(apps_path)
     all_results = []
     for _ in range(3):
         results = evaluate_code_samples(code_samples, apps)
......
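The docstring above describes re-running the evaluation until two runs agree. The body of evaluate is truncated in this diff; a minimal sketch of that retry-until-consistent pattern, assuming evaluate_code_samples returns comparable results, might look like:

# Illustrative retry loop only; not the actual body of evaluate().
def evaluate_until_consistent(code_samples, apps, loop_num=3):
    previous = None
    for _ in range(loop_num):
        results = evaluate_code_samples(code_samples, apps)
        if previous is not None and results == previous:
            return results      # two consecutive runs agree
        previous = results
    return previous             # give up after loop_num attempts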