Commit b319f162 by nanziyuan

fix gen_dataset bugs

parent b65ddb76
 import argparse
+import os
 import json
 from functools import partial
 from collections import defaultdict
@@ -21,7 +22,7 @@ def transform_to_prompt(apps, tokenizer):
     for split in ["train", "test"]:
         dataset = apps[split]
         for item in dataset:
-            task_id = split + "-" + str(item["id"])
+            task_id = split + "-" + str(item["problem_id"])
             try:
                 json.loads(item["input_output"])
             except ValueError:
@@ -40,7 +41,7 @@ def transform_to_prompt(apps, tokenizer):
             prompts.append(
                 {
                     "dataset": "apps-" + item["difficulty"],
-                    "task_id": "task_id",
+                    "task_id": task_id,
                     "messages": prompt,
                 }
             )
@@ -58,6 +59,7 @@ if __name__ == "__main__":
         "--gpu", type=int, default=1, help="gpu number required by one model"
     )
     args = parser.parse_args()
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
     apps = load_dataset(args.apps)
     tokenizer = AutoTokenizer.from_pretrained(args.model)
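Note: setting TOKENIZERS_PARALLELISM=false before the tokenizer is first used is presumably there to silence the Hugging Face tokenizers warning ("The current process just got forked, after parallelism has already been used") and avoid potential deadlocks when worker processes fork after the tokenizer has spawned its own threads.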
@@ -79,10 +81,11 @@ if __name__ == "__main__":
     # postprocess
     grouped = defaultdict(list)
     for sample in dataset:
-        grouped[sample["task_id"]] = sample
+        grouped[sample["task_id"]].append(sample)

     def is_in_test(task_id):
         split, idx = task_id.split("-")
+        idx = int(idx)
         if split == "test":
             for start, end in [(0, 300), (3000, 3100), (4000, 4100)]:
                 if start <= idx < end:
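For context, this hunk fixes two distinct bugs. Plain assignment into a defaultdict(list) replaces the whole list with a single sample instead of accumulating, and str.split returns strings, so the numeric range check in is_in_test would raise a TypeError in Python 3. A minimal sketch of both corrected patterns (the sample dicts here are illustrative, not the real schema):

from collections import defaultdict

samples = [
    {"task_id": "test-0", "pass": True},
    {"task_id": "test-0", "pass": False},
]

# Bug 1: `grouped[tid] = sample` overwrote the group on every iteration;
# append keeps all samples for a task together.
grouped = defaultdict(list)
for sample in samples:
    grouped[sample["task_id"]].append(sample)
assert len(grouped["test-0"]) == 2

# Bug 2: split("-") yields the index as a string; it must be cast before
# comparison, since `0 <= "0"` raises TypeError in Python 3.
split, idx = "test-0".split("-")
idx = int(idx)
assert 0 <= idx < 300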
@@ -28,7 +28,7 @@ def pass_at_k(samples, ks: list[int]):
     # groupby taskid
     grouped = defaultdict(list)
     for sample in samples:
-        grouped[sample["task_id"]] = sample
+        grouped[sample["task_id"]].append(sample)
     num_samples, num_correct = [], []
     for task_id, group in grouped.items():
@@ -65,7 +65,7 @@ def pos_neg_filter_uncertain(item, threshold):
 def top_at_k(samples, ks: list[int], score_func):
     grouped = defaultdict(list)
     for sample in samples:
-        grouped[sample["task_id"]] = sample
+        grouped[sample["task_id"]].append(sample)
     num_samples, first_pass_indices = [], []
     for task_id, group in grouped.items():
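The same grouping fix matters here for the metrics: with the overwriting bug, every task group held exactly one sample, so n was always 1 and pass@k degenerated. For reference, pass_at_k presumably applies the standard unbiased estimator from Chen et al. (2021), pass@k = 1 - C(n-c, k) / C(n, k) per task, averaged over tasks. A sketch of the per-task term, assuming n total samples and c correct ones:

import math

def estimate_pass_at_k(n: int, c: int, k: int) -> float:
    # Unbiased estimator: 1 - C(n - c, k) / C(n, k).
    # If fewer than k samples are incorrect, any draw of k contains a pass.
    if n - c < k:
        return 1.0
    return 1.0 - math.comb(n - c, k) / math.comb(n, k)

# e.g. 10 samples per task, 3 correct: pass@1 = 0.3, pass@10 = 1.0
assert abs(estimate_pass_at_k(10, 3, 1) - 0.3) < 1e-9
assert estimate_pass_at_k(10, 3, 10) == 1.0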
set -xe

model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"

# APPS
CUDA_VISIBLE_DEVICES=0,1,2,3 \
python -m codecritic.cli.gen_dataset \
    --model ${model} \
    --apps /lustre/S/nanziyuan/datasets/apps/ \
    --train "${project}/data/train/apps_train_samples.jsonl" \
    --test "${project}/data/test/apps_test_samples.jsonl"

# HumanEval & MBPP
# HumanEvalPack
# BigCodeBench