Commit c32b630e by nzy

reformat dataset

parent a4c9cab3
# CodeCritic
## Evaluation
### APPS-500 (Top@k)
### HumanEval (Top@k)
### MBPP (Top@k)
### HumanEvalPack-Python (Pairwise)
### BigCodeBench (TODO)
### Dataset Format
```json
{
    "dataset": "string, name or identifier of the dataset",
    "task_id": "integer or string, identifier for the task",
    "solution_id": "integer or string, unique identifier for the solution",
    "pass": "boolean, indicates whether the solution passed the task",
    "skip": "boolean, true if no solution passes this task",
    "messages": "list of dictionaries, conversation messages in OpenAI format",
    "positive_score": "float, probability of the 'Yes' token",
    "negative_score": "float, probability of the 'No' token",
    "meta_***": "any additional data or custom fields"
}
```
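For concreteness, a single record in this format might look like the following (all values are hypothetical; `meta_source` stands in for an arbitrary `meta_*` field):

```json
{
    "dataset": "apps",
    "task_id": "apps/0001",
    "solution_id": 3,
    "pass": true,
    "skip": false,
    "messages": [
        {"role": "user", "content": "Write a function add(a, b)."},
        {"role": "assistant", "content": "def add(a, b):\n    return a + b"}
    ],
    "positive_score": 0.87,
    "negative_score": 0.13,
    "meta_source": "example"
}
```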
import argparse

from codecritic.dataset.utils import mk_message, save_jsonl_dataset
from codecritic.utils.json import load_jsonl
from codecritic.dataset.verify import mk_critic_verify


def mk_sft_dataset(messages):
    # The last turn becomes the training target; everything before it is the prompt.
    question = messages[:-1]
    response = messages[-1:]
    return dict(question=question, response=response)


def convert_preference_to_sft(item):
    # Each preference pair yields one positive ("Yes") and one negative ("No") SFT example.
    message = item["messages"]
    chosen = item["chosen"]
    rejected = item["rejected"]
    messages1 = message + chosen + mk_critic_verify(True)
    messages2 = message + rejected + mk_critic_verify(False)
    return mk_sft_dataset(messages1), mk_sft_dataset(messages2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--preference_dataset", type=str)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--split", type=str, default="train")
    args = parser.parse_args()

    preference_dataset = load_jsonl(args.preference_dataset)
    sft_dataset = []
    for item in preference_dataset:
        sft_dataset.extend(convert_preference_to_sft(item))
    save_jsonl_dataset(sft_dataset, args.output_dir, args.split)
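As a sketch of the conversion, here is a hypothetical toy preference item run through `convert_preference_to_sft` (relying on `mk_critic_verify` appending the judge question plus the matching Yes/No assistant turn, per its definition elsewhere in this commit):

```python
# Hypothetical toy item, not from the real dataset.
item = {
    "messages": [{"role": "user", "content": "Write add(a, b)."}],
    "chosen": [{"role": "assistant", "content": "def add(a, b): return a + b"}],
    "rejected": [{"role": "assistant", "content": "def add(a, b): return a - b"}],
}
pos, neg = convert_preference_to_sft(item)
# pos["question"] ends with the judge question;
# pos["response"] == [{"role": "assistant", "content": "Yes"}]
# neg["response"] == [{"role": "assistant", "content": "No"}]
```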
import argparse
from itertools import chain

from codecritic.dataset.cov import (
    convert_preference_to_vot_prompt,
    convert_cov_to_cov_dataset,
)
from codecritic.utils.json import load_json
from codecritic.dataset.utils import save_jsonl_dataset
from codecritic.utils.vllm import vllm_chatcomplete


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str)
    parser.add_argument("--preference_dataset", type=str)
    parser.add_argument("--output_dir", type=str)
    args = parser.parse_args()

    preference_dataset = load_json(args.preference_dataset)

    # Build chain-of-verification prompts, then flatten the per-item lists.
    cov_prompts = [convert_preference_to_vot_prompt(x) for x in preference_dataset]
    cov_prompts = list(chain(*cov_prompts))

    # Greedy decoding: one deterministic verification per prompt.
    sampling_params = dict(n=1, temperature=0.0, max_tokens=2048)
    covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params)

    dataset = list(map(convert_cov_to_cov_dataset, covs))
    save_jsonl_dataset(dataset, args.output_dir)
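Since `convert_preference_to_vot_prompt` appears to return a list of prompts per item, the nested list is flattened with `chain` before inference. A tiny illustration with placeholder prompts:

```python
from itertools import chain

# Placeholder strings standing in for the real chain-of-verification prompts.
per_item = [["prompt_1a", "prompt_1b"], ["prompt_2a", "prompt_2b"]]
flat = list(chain(*per_item))
assert flat == ["prompt_1a", "prompt_1b", "prompt_2a", "prompt_2b"]
```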
import argparse
import datetime
from itertools import chain
from functools import partial
...@@ -66,4 +96,4 @@ if __name__ == "__main__":
    save_jsonl(covs, args.out + ".raw")
    dataset = [convert_cov_to_cov_dataset(x, args.mode) for x in covs]
    save_jsonl(dataset, args.out)
...@@ -5,8 +5,8 @@ import os
 from transformers import AutoTokenizer
 from vllm import SamplingParams
-from codecritic.dataset.genrm_prompt import THINK_PROMPT, JUDGE_PROMPT, JUDGE_TOEKNS
-from codecritic.evaluation.inference import generate_worker, score_worker
+from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS
+from codecritic.utils.inference import generate_worker, score_worker
 from codecritic.utils.parallel import model_map
 from codecritic.utils.json import load_jsonl, save_jsonl
...@@ -35,7 +35,7 @@ if __name__ == "__main__":
     if args.reasoning:
         for item in dataset:
-            item["messages"].append({"role": "user", "content": THINK_PROMPT})
+            item["messages"].append(THINK_MESSAGE)
     sampling_params = SamplingParams(
         n=1,
...@@ -58,7 +58,7 @@ if __name__ == "__main__":
     negative_token = get_token_id(JUDGE_TOEKNS["negative"])
     for item in dataset:
-        item["messages"].append({"role": "user", "content": JUDGE_PROMPT})
+        item["messages"].append(JUDGE_MESSAGE)
     worker = partial(
         score_worker,
...
 JUDGE_PROMPT = "Is the code correct (Yes/No)?"
+JUDGE_MESSAGE = {"role": "user", "content": JUDGE_PROMPT}
 JUDGE_TOEKNS = {
     "positive": "Yes",
     "negative": "No"
 }
-def mk_critic_verify(answer=None):
-    # answer: bool or None
-    message = [{"role": "user", "content": JUDGE_PROMPT}]
-    if answer is not None:
-        response = "Yes" if answer else "No"
-        message.append({"role": "assistant", "content": response})
-    return message
+def mk_judge_response(response):
+    """response: "positive" or "negative" """
+    return {"role": "assistant", "content": JUDGE_TOEKNS[response]}
+THINK_PROMPT = ""
+THINK_MESSAGE = {"role": "user", "content": THINK_PROMPT}
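As a quick illustration, the new helpers behave as follows (these checks follow directly from the definitions above):

```python
assert JUDGE_MESSAGE == {"role": "user", "content": "Is the code correct (Yes/No)?"}
assert mk_judge_response("positive") == {"role": "assistant", "content": "Yes"}
assert mk_judge_response("negative") == {"role": "assistant", "content": "No"}
```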
import codecritic.dataset.genrm_prompt as genrm_prompt


def reward_to_sft(item):
    # Split one preference item into a positive and a negative judge example.
    # Fields other than messages/chosen/rejected (e.g. meta_*) carry through.
    messages = item.pop("messages")
    chosen_response = item.pop("chosen")
    rejected_response = item.pop("rejected")
    chosen = {
        **item,
        "question": messages + chosen_response + [genrm_prompt.JUDGE_MESSAGE],
        "response": genrm_prompt.mk_judge_response("positive"),
    }
    rejected = {
        **item,
        "question": messages + rejected_response + [genrm_prompt.JUDGE_MESSAGE],
        "response": genrm_prompt.mk_judge_response("negative"),
    }
    return chosen, rejected
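A hypothetical toy item run through `reward_to_sft` (the `meta_x` field is illustrative):

```python
item = {
    "meta_x": 1,
    "messages": [{"role": "user", "content": "Write add(a, b)."}],
    "chosen": [{"role": "assistant", "content": "def add(a, b): return a + b"}],
    "rejected": [{"role": "assistant", "content": "def add(a, b): return a - b"}],
}
chosen, rejected = reward_to_sft(item)
# chosen["question"] ends with JUDGE_MESSAGE; chosen["response"]["content"] == "Yes"
# rejected["response"]["content"] == "No"; both keep "meta_x": 1
```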
"""
changing the format of the dataset.
"""
\ No newline at end of file
...@@ -51,26 +51,6 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
 def score_worker(cuda_device, prompts, model_path, positive_token, negative_token=None):
-    def only_positive(logprob):
-        positive_logprob = logprob.get(positive_token)
-        positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
-        return {"score": positive_prob}
-
-    def pos_and_neg(logprob):
-        positive_logprob = logprob.get(positive_token)
-        positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
-        negative_logprob = logprob.get(negative_token)
-        negative_prob = np.exp(negative_logprob.logprob) if negative_logprob else 0
-        return {
-            "score": positive_prob / (positive_prob + negative_prob),
-            "uncertainty": 1 - (positive_prob + negative_prob),
-        }
-
-    compute_score = only_positive if negative_token is None else pos_and_neg
-
     os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_device)
     llm = LLM(
...@@ -97,9 +77,21 @@ def score_worker(cuda_device, prompts, model_path, positive_token, negative_toke
         assert len(output.outputs) == 1, "The scorer must provide a single score."
         for response in output.outputs:
             # response.logprobs: list[dict[int, Logprob]] https://github.com/vllm-project/vllm/blob/main/vllm/sequence.py
-            scores = compute_score(response.logprobs[0])
+            logprob = response.logprobs[0]
+            positive_logprob = logprob.get(positive_token)
+            positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
+            if negative_token:
+                negative_logprob = logprob.get(negative_token)
+                negative_prob = np.exp(negative_logprob.logprob) if negative_logprob else 0
+            else:
+                negative_prob = None
             text = response.text
-            results.append({**item, **scores, "critic_text": text})
+            result = item.copy()
+            result["positive_score"] = positive_prob
+            result["negative_score"] = negative_prob
+            result["meta_score_response"] = text
+            results.append(result)
     return results
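After this change, a record's `positive_score` and `negative_score` come straight from the first generated token's logprobs. A minimal standalone sketch of that arithmetic, assuming vLLM-style `Logprob` objects exposing a `.logprob` attribute (`yes_no_scores` is a hypothetical helper, not part of the repo):

```python
import numpy as np

def yes_no_scores(logprob_dict, positive_token, negative_token=None):
    # logprob_dict maps token ids to objects with a .logprob attribute,
    # as in vLLM's response.logprobs[0]. A missing token gets probability 0.
    pos = logprob_dict.get(positive_token)
    positive_score = float(np.exp(pos.logprob)) if pos else 0.0
    negative_score = None
    if negative_token is not None:
        neg = logprob_dict.get(negative_token)
        negative_score = float(np.exp(neg.logprob)) if neg else 0.0
    return positive_score, negative_score
```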