Commit ddc93279 by nzy

Refactor evaluation. This commit only covers test_genrm and test_orm; the other scripts do not work yet.

parent 4e3e7502
import argparse
import os
from pathlib import Path
import pprint
from codecritic.data.utils import mk_message
from codecritic.data.verify import JUDGE_PROMPT
from transformers import AutoTokenizer
from codecritic.data.code import extract_code, code_template
from codecritic.data.cov import COV_PROMPT
from codecritic.data.verify import get_score_token_id
from codecritic.utils.vllm import vllm_chatcomplete, vllm_score
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.utils.metric import group_results, score_pass_at_k

def append_prompt(item, content):
    item["messages"].append({"role": "user", "content": content})
    return item

def run_sft_model(model_path, test_path, apps_path, reason_prompt, model_gpu):
    home_path = Path(model_path).parent
    result_dir = home_path / "eval"
    result_dir.mkdir(exist_ok=True)

    # preprocess prompt
    test_dataset = load_jsonl(test_path)

    # reason: only run the extra generation pass when a reasoning prompt is given
    if reason_prompt:
        test_dataset = [append_prompt(x, reason_prompt) for x in test_dataset]
        sampling_params = dict(n=1, temperature=0.0, max_tokens=2048)
        test_dataset = vllm_chatcomplete(model_path, test_dataset, sampling_params, model_gpu)

    # score
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    score_token = get_score_token_id(tokenizer)
    test_dataset = [append_prompt(x, JUDGE_PROMPT) for x in test_dataset]
    results = vllm_score(model_path, test_dataset, score_token, model_gpu)
    score_path = result_dir / "scores.jsonl"
    save_jsonl(results, score_path)

    # compute pass@k
    eval_result_path = result_dir / "passk.jsonl"
    # results = load_jsonl(score_path)
    groups = group_results(results, apps_path)
    eval_results = [score_pass_at_k(groups, k, home_path.stem) for k in range(1, 16)]
    save_jsonl(eval_results, eval_result_path)
    pprint.pp(eval_results)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str)
    parser.add_argument("--test", type=str)
    parser.add_argument("--apps", type=str)
    parser.add_argument("--reason", choices=["cov"])
    parser.add_argument("--gpu", type=int, default=1, help="gpu number required by model")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    reason_prompts = {"cov": COV_PROMPT}
    reason_prompt = reason_prompts.get(args.reason, None)

    run_sft_model(args.model, args.test, args.apps, reason_prompt, args.gpu)
import argparse
from functools import partial
import os
from transformers import AutoTokenizer
from codecritic.data.cov import COV_PROMPT
from codecritic.data.genrm_prompt import JUDGE_PROMPT
from codecritic.evaluation.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl

def append_prompt(item, content):
    item["messages"].append({"role": "user", "content": content})
    return item

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--sample", type=str, help="path/to/sample")
    parser.add_argument("--output", type=str, help="path/to/score")
    parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
    parser.add_argument(
        "--gpu", type=int, default=1, help="gpu number required by model"
    )
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    dataset = load_jsonl(args.sample)

    if args.reasoning:
        dataset = [append_prompt(x, COV_PROMPT) for x in dataset]
        # generate_worker expects a single sampling_params dict (see inference.py)
        worker = partial(
            generate_worker,
            model_path=args.model,
            sampling_params=dict(n=1, temperature=0, max_tokens=4096),
        )
        dataset = model_map(worker, dataset, args.gpu)

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    def get_token_id(token):
        score_tokens = tokenizer.encode(token, add_special_tokens=False)
        assert len(score_tokens) == 1
        return score_tokens[0]

    positive_token = get_token_id("Yes")
    negative_token = get_token_id("No")

    dataset = [append_prompt(x, JUDGE_PROMPT) for x in dataset]
    worker = partial(
        score_worker,
        model_path=args.model,
        positive_token=positive_token,
        negative_token=negative_token,
    )
    dataset = model_map(worker, dataset, args.gpu)

    save_jsonl(dataset, args.output)
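Unlike the old script above, the refactored version stops after writing scores; pass@k is presumably computed in a separate step. Below is a minimal sketch of that follow-up step reusing group_results and score_pass_at_k; the file names, the APPS path, and the "genrm" strategy label are assumptions, not part of the commit.

# Hypothetical follow-up step: turn the saved scores into pass@k numbers.
# Paths and the strategy label are placeholders.
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.utils.metric import group_results, score_pass_at_k

scores = load_jsonl("scores.jsonl")              # the --output file written above
groups = group_results(scores, "path/to/apps")   # groups items by APPS difficulty
passk = [score_pass_at_k(groups, k, "genrm") for k in range(1, 16)]
save_jsonl(passk, "passk.jsonl")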
 import argparse
 import json
-from tqdm import tqdm
 import requests
+from tqdm import tqdm
 from transformers import AutoTokenizer
-import pprint
-from pathlib import Path
-from codecritic.data.code import code_template, extract_code
 from codecritic.utils.json import load_jsonl, save_jsonl
-from codecritic.utils.metric import group_results, score_pass_at_k


 def get_rewards_from_server(server_url: str, messages: list[str]):
@@ -22,41 +18,21 @@ def get_rewards_from_server(server_url: str, messages: list[str]):
     return rewards


-def test_reward_model(server_url, item, tokenizer):
-    query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
-    score = get_rewards_from_server(server_url, [query])[0]
-    return {
-        "problem_id": item["problem_id"],
-        "messages": item["messages"],
-        "eval_result": item["eval_result"],
-        "score": score,
-    }
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str)
-    parser.add_argument("--test", type=str)
-    parser.add_argument("--apps", type=str)
+    parser.add_argument("--model", type=str, help="path/to/model")
+    parser.add_argument("--sample", type=str, help="path/to/sample")
+    parser.add_argument("--output", type=str, help="path/to/score")
     args = parser.parse_args()

-    home_path = Path(args.model).parent
-    result_dir = home_path / "eval"
-    result_dir.mkdir(exist_ok=True)
-
     # compute score
-    test_dataset = load_jsonl(args.test)
+    dataset = load_jsonl(args.sample)
     server_url = "http://0.0.0.0:5000/get_reward"
     tokenizer = AutoTokenizer.from_pretrained(args.model)

-    results = [test_reward_model(server_url, item, tokenizer) for item in tqdm(test_dataset)]
-    score_path = result_dir / "scores.jsonl"
-    save_jsonl(results, score_path)
+    for item in tqdm(dataset):
+        query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
+        score = get_rewards_from_server(server_url, [query])[0]
+        item["score"] = score

-    # compute pass@k
-    results = load_jsonl(score_path)
-    groups = group_results(results, args.apps)
-    eval_results = [score_pass_at_k(groups, k, home_path.stem) for k in range(1, 16)]
-    eval_result_path = result_dir / "passk.jsonl"
-    save_jsonl(eval_results, eval_result_path)
-    pprint.pp(eval_results)
+    save_jsonl(dataset, args.output)
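The rewritten loop above sends one HTTP request per sample even though get_rewards_from_server accepts a list of messages. A hedged sketch of a batched variant follows; the batch size and chunking are illustrative and not part of the commit.

# Hypothetical batched scoring loop (illustrative only; BATCH is an assumed knob).
BATCH = 32
queries = [
    tokenizer.apply_chat_template(item["messages"], tokenize=False) for item in dataset
]
for start in range(0, len(queries), BATCH):
    rewards = get_rewards_from_server(server_url, queries[start:start + BATCH])
    for item, score in zip(dataset[start:start + BATCH], rewards):
        item["score"] = score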
@@ -11,7 +11,4 @@ def mk_critic_verify(answer=None):
     return message


-def get_score_token_id(tokenizer, token_str="Yes"):
-    score_tokens = tokenizer.encode(token_str, add_special_tokens=False)
-    assert len(score_tokens) == 1
-    return score_tokens[0]
# Note that the human and observation should appear in odd positions
# while llm should appear in even positions.
from codecritic.utils.json import save_jsonl
from pathlib import Path
def mk_message(user, assistant):
    return [
        {"role": "user", "content": user},
        {"role": "assistant", "content": assistant},
    ]


# TODO This function can be removed
def save_jsonl_dataset(dataset, output_dir, split="train"):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    save_jsonl(dataset, output_dir / f"{split}.jsonl")

SPLITTER = "__I_wish_it_were_weekends_all_the_time.__"
-from functools import partial
 import os
+import numpy as np
 from vllm import LLM, SamplingParams
-from codecritic.data.utils import SPLITTER
+
+SPLITTER = "__I_wish_it_were_weekends_all_the_time.__"


 def generate_worker(cuda_device, prompts, model_path, sampling_params):
-    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_device)
-    llm = LLM(model=model_path,
-              seed=42,
-              max_model_len=8 * 1024,
-              swap_space=16,
-              tensor_parallel_size=len(cuda_device))
+    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_device)
+    llm = LLM(
+        model=model_path,
+        seed=42,
+        max_model_len=8 * 1024,
+        swap_space=16,
+        tensor_parallel_size=len(cuda_device),
+    )
     tokenizer = llm.get_tokenizer()
     stop_tokens = [tokenizer.eos_token_id]
     print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")

     vllm_sampling_params = SamplingParams(
-        n=sampling_params['n'],
-        temperature=sampling_params['temperature'],
+        n=sampling_params["n"],
+        temperature=sampling_params["temperature"],
         top_p=0.95,
-        max_tokens=sampling_params['max_tokens'],
-        stop_token_ids=stop_tokens
+        max_tokens=sampling_params["max_tokens"],
+        stop_token_ids=stop_tokens,
     )
     print("Sampling params:", vllm_sampling_params)

     def messages_to_text(messages):
-        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        text = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
         if SPLITTER in text:
             text = text.split(SPLITTER)[0]
         return text

     text_prompts = [messages_to_text(item["messages"]) for item in prompts]
-    outputs = llm.generate(text_prompts, sampling_params=vllm_sampling_params, use_tqdm=True)
+    outputs = llm.generate(
+        text_prompts, sampling_params=vllm_sampling_params, use_tqdm=True
+    )

     results = []
     for item, output in zip(prompts, outputs):
@@ -58,16 +64,14 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
     return results


-def score_worker(cuda_device, prompts, model_path, score_token):
-    def compute_score_onetoken(logprob):
-        positive_token = score_token[0]
+def score_worker(cuda_device, prompts, model_path, positive_token, negative_token=None):
+    def only_positive(logprob):
         positive_logprob = logprob.get(positive_token)
         positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
         return {"score": positive_prob}

-    def compute_score_twotoken(logprob):
-        positive_token, negative_token = score_token[0], score_token[1]
+    def pos_and_neg(logprob):
         positive_logprob = logprob.get(positive_token)
         positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
@@ -76,40 +80,40 @@ def score_worker(cuda_device, prompts, model_path, score_token):
         return {
             "score": positive_prob / (positive_prob + negative_prob),
-            "uncertainty": 1 - (positive_prob + negative_prob)
+            "uncertainty": 1 - (positive_prob + negative_prob),
         }

-    if len(score_token) == 1:
-        compute_score = compute_score_onetoken
-    elif len(score_token) == 2:
-        compute_score = compute_score_twotoken
-    else:
-        raise NotImplementedError("param: score_token length should be 1 or 2")
+    compute_score = only_positive if negative_token is None else pos_and_neg

-    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_device)
-    llm = LLM(model=model_path,
-              seed=42,
-              max_model_len=8 * 1024,
-              swap_space=16,
-              tensor_parallel_size=len(cuda_device))
+    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_device)
+    llm = LLM(
+        model=model_path,
+        seed=42,
+        max_model_len=8 * 1024,
+        swap_space=16,
+        tensor_parallel_size=len(cuda_device),
+    )
     tokenizer = llm.get_tokenizer()
     print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")

-    vllm_sampling_params = SamplingParams(
-        n=1,
-        temperature=0,
-        max_tokens=5,
-        logprobs=20
-    )
+    vllm_sampling_params = SamplingParams(n=1, temperature=0, max_tokens=5, logprobs=20)

-    text_prompts = [tokenizer.apply_chat_template(item["messages"], tokenize=False, add_generation_prompt=True) for item in prompts]
+    text_prompts = [
+        tokenizer.apply_chat_template(
+            item["messages"], tokenize=False, add_generation_prompt=True
+        )
+        for item in prompts
+    ]

-    outputs = llm.generate(text_prompts, sampling_params=vllm_sampling_params, use_tqdm=False)
+    outputs = llm.generate(
+        text_prompts, sampling_params=vllm_sampling_params, use_tqdm=False
+    )

     results = []
     for item, output in zip(prompts, outputs):
+        assert len(output.outputs) == 1, "The scorer must provide a single score."
         for response in output.outputs:
             # response.logprobs: list[dict[int, Logprob]] https://github.com/vllm-project/vllm/blob/main/vllm/sequence.py
             scores = compute_score(response.logprobs[0])
...
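For intuition, score_worker's helpers reduce to normalizing the probabilities of the positive and negative judge tokens at the first generated position. A standalone sketch of the same arithmetic with made-up logprob values (not from any real run):

import numpy as np

# Made-up logprobs for the "Yes" and "No" token ids (illustrative only).
yes_logprob, no_logprob = -0.2, -2.0
p_yes, p_no = np.exp(yes_logprob), np.exp(no_logprob)

only_positive_score = p_yes                  # used when negative_token is None
pos_and_neg_score = p_yes / (p_yes + p_no)   # normalized "Yes" probability
uncertainty = 1 - (p_yes + p_no)             # probability mass on neither token
print(only_positive_score, pos_and_neg_score, uncertainty)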
 import json
 from pathlib import Path


 def ensure_parent(path):
     path = Path(path)
     path.parent.mkdir(parents=True, exist_ok=True)
...
import numpy as np
from datasets import load_dataset
from collections import defaultdict
def estimate_pass_at_k(
    num_samples: list[int], num_correct: list[int], k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    return np.array(
        [estimator(int(n), int(c), k) for n, c in zip(num_samples, num_correct)]
    )

def group_results(results, apps_path):
    """
    Output
    {
        "interview": {
            problem_id: [
                {"problem_id": problem_id, "eval_result": True, ...},
                ...
            ],
            ...
        },
        ...
    }
    """
    dataset = load_dataset(apps_path)

    groups = defaultdict(lambda: defaultdict(list))
    for item in results:
        problem_id = item["problem_id"]
        split, idx = problem_id.split("_")
        difficulty = dataset[split][int(idx)]["difficulty"]
        groups[difficulty][problem_id].append(item)

    if "score" in results[0]:
        for difficulty, problem in groups.items():
            for problem_id, lst in problem.items():
                sorted_lst = sorted(lst, key=lambda x: x["score"], reverse=True)
                problem[problem_id] = sorted_lst

    return groups

def pass_at_k(groups, k):
    result = {"strategy": "pass@k", "k": k}
    for difficulty, problems in groups.items():
        num_samples, num_correct = [], []
        for lst in problems.values():
            num_samples.append(len(lst))
            num_correct.append(sum(item["eval_result"] for item in lst))
        pass_at_k = np.mean(estimate_pass_at_k(num_samples, num_correct, k))
        result[difficulty] = pass_at_k
    return result

def score_pass_at_k(groups, k, strategy):
    result = {"strategy": strategy, "k": k}
    for difficulty, problems in groups.items():
        num_samples, num_correct = 0, 0
        for lst in problems.values():
            num_samples += 1
            num_correct += any(item["eval_result"] for item in lst[:k])
        pass_at_k = num_correct / num_samples
        result[difficulty] = pass_at_k
    return result
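A small worked example of the two estimators above, with invented counts: estimate_pass_at_k is the unbiased pass@k estimator over n samples with c correct, while score_pass_at_k asks whether any of the k highest-scored samples for a problem passes. The numbers below are hypothetical.

# Worked example (hypothetical counts) against the functions defined above.
from codecritic.utils.metric import estimate_pass_at_k

# Three problems, 10 samples each, with 3 / 0 / 5 correct samples.
num_samples, num_correct = [10, 10, 10], [3, 0, 5]
print(estimate_pass_at_k(num_samples, num_correct, 1))  # -> [0.3, 0.0, 0.5]

# score_pass_at_k(groups, k, strategy) assumes each problem's sample list is already
# sorted by "score" (group_results sorts it when scores are present) and reports,
# per difficulty, the fraction of problems solved within the top-k samples.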