Commit c32b630e by nzy

reformat dataset

parent a4c9cab3
# CodeCritic
## Evaluation
### APPS-500 (Top@k)
### HumanEval (Top@k)
### MBPP (Top@k)
### HumanEvalPack-Python (Pairwise)
### BigCodeBench (TODO)
### Dataset Format
```json
{
  "dataset": "string, name or identifier of the dataset",
  "task_id": "integer or string, identifier for the task",
  "solution_id": "integer or string, unique identifier for the solution",
  "pass": "boolean, indicates whether the solution passed the task",
  "skip": "boolean, set to true if no solution passes this task",
  "messages": "list of dictionaries, conversation messages in OpenAI format",
  "positive_score": "float, probability of the 'Yes' token",
  "negative_score": "float, probability of the 'No' token",
  "meta_***": "any additional data or custom fields"
}
```
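
For concreteness, an illustrative record in this format. All values below are made up, and `meta_difficulty` stands in for an arbitrary `meta_***` field:

```json
{
  "dataset": "apps",
  "task_id": 4321,
  "solution_id": 3,
  "pass": true,
  "skip": false,
  "messages": [
    {"role": "user", "content": "Write a function that returns the sum of a list."},
    {"role": "assistant", "content": "def total(xs):\n    return sum(xs)"}
  ],
  "positive_score": 0.91,
  "negative_score": 0.06,
  "meta_difficulty": "introductory"
}
```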
import argparse

from codecritic.dataset.utils import mk_message, save_jsonl_dataset
from codecritic.utils.json import load_jsonl
from codecritic.dataset.verify import mk_critic_verify


def mk_sft_dataset(messages):
    # Split a conversation into the prompt (all turns but the last)
    # and the target response (the last turn).
    question = messages[:-1]
    response = messages[-1:]
    return dict(question=question, response=response)


def convert_preference_to_sft(item):
    # Each preference item yields two SFT examples: the chosen solution
    # followed by a "Yes" judgement, and the rejected one followed by "No".
    message = item["messages"]
    chosen = item["chosen"]
    rejected = item["rejected"]
    messages1 = message + chosen + mk_critic_verify(True)
    messages2 = message + rejected + mk_critic_verify(False)
    return mk_sft_dataset(messages1), mk_sft_dataset(messages2)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--preference_dataset", type=str)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--split", type=str, default="train")
    args = parser.parse_args()

    preference_dataset = load_jsonl(args.preference_dataset)
    sft_dataset = []
    for item in preference_dataset:
        sft_dataset.extend(convert_preference_to_sft(item))
    save_jsonl_dataset(sft_dataset, args.output_dir, args.split)
import argparse
from itertools import chain

from codecritic.dataset.cov import (
    convert_preference_to_vot_prompt,
    convert_cov_to_cov_dataset,
)
from codecritic.utils.json import load_json
from codecritic.dataset.utils import save_jsonl_dataset
from codecritic.utils.vllm import vllm_chatcomplete


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str)
    parser.add_argument("--preference_dataset", type=str)
    parser.add_argument("--output_dir", type=str)
    args = parser.parse_args()

    preference_dataset = load_json(args.preference_dataset)
    cov_prompts = [convert_preference_to_vot_prompt(x) for x in preference_dataset]
    cov_prompts = list(chain(*cov_prompts))

    sampling_params = dict(n=1, temperature=0.0, max_tokens=2048)
    covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params)

    dataset = list(map(convert_cov_to_cov_dataset, covs))
    save_jsonl_dataset(dataset, args.output_dir)
import argparse
import datetime
from itertools import chain
from functools import partial
......
@@ -5,8 +5,8 @@ import os
 from transformers import AutoTokenizer
 from vllm import SamplingParams
-from codecritic.dataset.genrm_prompt import THINK_PROMPT, JUDGE_PROMPT, JUDGE_TOEKNS
+from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS
-from codecritic.evaluation.inference import generate_worker, score_worker
+from codecritic.utils.inference import generate_worker, score_worker
 from codecritic.utils.parallel import model_map
 from codecritic.utils.json import load_jsonl, save_jsonl
@@ -35,7 +35,7 @@ if __name__ == "__main__":
     if args.reasoning:
         for item in dataset:
-            item["messages"].append({"role": "user", "content": THINK_PROMPT})
+            item["messages"].append(THINK_MESSAGE)
     sampling_params = SamplingParams(
         n=1,
@@ -58,7 +58,7 @@ if __name__ == "__main__":
     negative_token = get_token_id(JUDGE_TOEKNS["negative"])
     for item in dataset:
-        item["messages"].append({"role": "user", "content": JUDGE_PROMPT})
+        item["messages"].append(JUDGE_MESSAGE)
     worker = partial(
         score_worker,
......
 JUDGE_PROMPT = "Is the code correct (Yes/No)?"
+JUDGE_MESSAGE = {"role": "user", "content": JUDGE_PROMPT}
 JUDGE_TOEKNS = {
     "positive": "Yes",
     "negative": "No"
 }
-def mk_critic_verify(answer=None):
-    # answer: bool or none
-    message = [{"role": "user", "content": JUDGE_PROMPT}]
-    if answer is not None:
-        response = "Yes" if answer else "No"
-        message.append({"role": "assistant", "content": response})
-    return message
+def mk_judge_response(response):
+    "response: positive or negative"
+    return {"role": "assistant", "content": JUDGE_TOEKNS[response]}
+THINK_PROMPT = ""
+THINK_MESSAGE = {"role": "user", "content": THINK_PROMPT}
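
To make the intended usage of the new helpers concrete, a minimal sketch; the conversation content below is invented for illustration:

```python
from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, mk_judge_response

# Invented task/solution pair standing in for a real dataset item.
conversation = [
    {"role": "user", "content": "Write a function that returns the sum of a list."},
    {"role": "assistant", "content": "def total(xs):\n    return sum(xs)"},
]

# At scoring time the judge question is appended as a user turn ...
scoring_prompt = conversation + [JUDGE_MESSAGE]

# ... and the SFT label is a single assistant turn containing "Yes" or "No".
gold_label = mk_judge_response("positive")  # {"role": "assistant", "content": "Yes"}
```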
import codecritic.dataset.genrm_prompt as genrm_prompt


def reward_to_sft(item):
    # Turn one preference item into a chosen ("Yes") and a rejected ("No")
    # SFT example; remaining keys are carried through as metadata.
    messages = item.pop("messages")
    chosen_response = item.pop("chosen")
    rejected_response = item.pop("rejected")

    chosen = {
        **item,
        "question": messages + chosen_response + [genrm_prompt.JUDGE_MESSAGE],
        "response": genrm_prompt.mk_judge_response("positive"),
    }
    rejected = {
        **item,
        "question": messages + rejected_response + [genrm_prompt.JUDGE_MESSAGE],
        "response": genrm_prompt.mk_judge_response("negative"),
    }
    return chosen, rejected
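
A minimal usage sketch, assuming `reward_to_sft` above and `codecritic.dataset.genrm_prompt` are importable; the preference item below is made up:

```python
# Hypothetical preference item: shared prompt plus a chosen and a rejected solution.
item = {
    "dataset": "apps",
    "task_id": 4321,
    "messages": [{"role": "user", "content": "Write add(a, b) that returns a + b."}],
    "chosen": [{"role": "assistant", "content": "def add(a, b):\n    return a + b"}],
    "rejected": [{"role": "assistant", "content": "def add(a, b):\n    return a - b"}],
}

chosen_sft, rejected_sft = reward_to_sft(item)
# chosen_sft["question"]   -> prompt + chosen solution + judge question
# chosen_sft["response"]   -> {"role": "assistant", "content": "Yes"}
# rejected_sft["response"] -> {"role": "assistant", "content": "No"}
# Extra keys ("dataset", "task_id") are carried through into both examples.
```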
"""
changing the format of the dataset.
"""
@@ -51,26 +51,6 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
 def score_worker(cuda_device, prompts, model_path, positive_token, negative_token=None):
-    def only_positive(logprob):
-        positive_logprob = logprob.get(positive_token)
-        positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
-        return {"score": positive_prob}
-    def pos_and_neg(logprob):
-        positive_logprob = logprob.get(positive_token)
-        positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
-        negative_logprob = logprob.get(negative_token)
-        negative_prob = np.exp(negative_logprob.logprob) if negative_logprob else 0
-        return {
-            "score": positive_prob / (positive_prob + negative_prob),
-            "uncertainty": 1 - (positive_prob + negative_prob),
-        }
-    compute_score = only_positive if negative_token is None else pos_and_neg
     os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_device)
     llm = LLM(
@@ -97,9 +77,21 @@ def score_worker(cuda_device, prompts, model_path, positive_token, negative_token=None):
         assert len(output.outputs) == 1, "The scorer must provide a single score."
         for response in output.outputs:
             # response.logprobs: list[dict[int, Logprob]] https://github.com/vllm-project/vllm/blob/main/vllm/sequence.py
-            scores = compute_score(response.logprobs[0])
+            logprob = response.logprobs[0]
+            positive_logprob = logprob.get(positive_token)
+            positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
+            if negative_token:
+                negative_logprob = logprob.get(negative_token)
+                negative_prob = np.exp(negative_logprob.logprob) if negative_logprob else 0
+            else:
+                negative_prob = None
             text = response.text
-            results.append({**item, **scores, "critic_text": text})
+            result = item.copy()
+            result["positive_score"] = positive_prob
+            result["negative_score"] = negative_prob
+            result["meta_score_response"] = text
+            results.append(result)
     return results
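
The worker now records raw `positive_score`/`negative_score` probabilities instead of a pre-normalized score. If a consumer needs the old normalized value, it could recombine them along the lines of the removed `pos_and_neg` helper; the sketch below is hypothetical and not part of the repository:

```python
def normalized_yes_prob(positive_score, negative_score):
    """Approximate the removed pos_and_neg score: P(Yes) / (P(Yes) + P(No)).

    Falls back to the raw P(Yes) when the negative token was not scored
    (i.e. negative_score is None, matching the new worker output).
    """
    if negative_score is None:
        return positive_score
    denom = positive_score + negative_score
    return positive_score / denom if denom > 0 else 0.0


# Example: a record produced by score_worker with both tokens scored.
print(normalized_yes_prob(0.6, 0.2))  # 0.75
```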