Commit ddc93279 by nzy

Refactor evaluation. This commit only covers test_genrm and test_orm; the other scripts do not work yet.

parent 4e3e7502
import argparse
import os
import pprint
from pathlib import Path

from transformers import AutoTokenizer

from codecritic.data.code import extract_code, code_template
from codecritic.data.cov import COV_PROMPT
from codecritic.data.utils import mk_message
from codecritic.data.verify import JUDGE_PROMPT, get_score_token_id
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.utils.metric import group_results, score_pass_at_k
from codecritic.utils.vllm import vllm_chatcomplete, vllm_score


def append_prompt(item, content):
    item["messages"].append({"role": "user", "content": content})
    return item
def run_sft_model(model_path, test_path, apps_path, reason_prompt, model_gpu):
    home_path = Path(model_path).parent
    result_dir = home_path / "eval"
    result_dir.mkdir(exist_ok=True)

    # preprocess prompt
    test_dataset = load_jsonl(test_path)

    # reason
    if reason_prompt:
        test_dataset = [append_prompt(x, reason_prompt) for x in test_dataset]
        sampling_params = dict(n=1, temperature=0.0, max_tokens=2048)
        test_dataset = vllm_chatcomplete(model_path, test_dataset, sampling_params, model_gpu)

    # score
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    score_token = get_score_token_id(tokenizer)
    test_dataset = [append_prompt(x, JUDGE_PROMPT) for x in test_dataset]
    results = vllm_score(model_path, test_dataset, score_token, model_gpu)

    score_path = result_dir / "scores.jsonl"
    save_jsonl(results, score_path)

    # compute pass@k
    eval_result_path = result_dir / "passk.jsonl"
    # results = load_jsonl(score_path)
    groups = group_results(results, apps_path)
    eval_results = [score_pass_at_k(groups, k, home_path.stem) for k in range(1, 16)]
    save_jsonl(eval_results, eval_result_path)
    pprint.pp(eval_results)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str)
    parser.add_argument("--test", type=str)
    parser.add_argument("--apps", type=str)
    parser.add_argument("--reason", choices=["cov"])
    parser.add_argument("--gpu", type=int, default=1, help="number of gpus required by the model")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    reason_prompts = {"cov": COV_PROMPT}
    reason_prompt = reason_prompts.get(args.reason, None)

    run_sft_model(args.model, args.test, args.apps, reason_prompt, args.gpu)
import argparse
from functools import partial
import os

from transformers import AutoTokenizer

from codecritic.data.cov import COV_PROMPT  # used below; import path assumed from the older script
from codecritic.data.genrm_prompt import JUDGE_PROMPT
from codecritic.evaluation.inference import generate_worker, score_worker
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.utils.parallel import model_map


def append_prompt(item, content):
    item["messages"].append({"role": "user", "content": content})
    return item
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--sample", type=str, help="path/to/sample")
    parser.add_argument("--output", type=str, help="path/to/score")
    parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
    parser.add_argument(
        "--gpu", type=int, default=1, help="number of gpus required by the model"
    )
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    dataset = load_jsonl(args.sample)

    if args.reasoning:
        dataset = [append_prompt(x, COV_PROMPT) for x in dataset]
        worker = partial(
            generate_worker,
            model_path=args.model,
            n=1,
            temperature=0,
            max_tokens=4096,
        )
        dataset = model_map(worker, dataset, args.gpu)

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    def get_token_id(token):
        score_tokens = tokenizer.encode(token, add_special_tokens=False)
        assert len(score_tokens) == 1
        return score_tokens[0]

    positive_token = get_token_id("Yes")
    negative_token = get_token_id("No")

    dataset = [append_prompt(x, JUDGE_PROMPT) for x in dataset]
    worker = partial(
        score_worker,
        model_path=args.model,
        positive_token=positive_token,
        negative_token=negative_token,
    )
    dataset = model_map(worker, dataset, args.gpu)

    save_jsonl(dataset, args.output)
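For orientation, a minimal sketch of the record shape this script appears to assume; the field names are taken from how the other scripts in this commit read and write them, and the concrete values are invented.

# Hypothetical sample record (values invented); field names follow the rest of the commit.
item = {
    "problem_id": "test_42",  # "<split>_<index>", parsed by group_results in codecritic.utils.metric
    "messages": [
        {"role": "user", "content": "Write a function that ..."},
        {"role": "assistant", "content": "def solve(): ..."},
    ],
    "eval_result": True,  # whether the sampled solution passed the unit tests
}
# After scoring, score_worker attaches:
#   item["score"]        P(Yes), or P(Yes) / (P(Yes) + P(No)) when a negative token is given
#   item["uncertainty"]  1 - (P(Yes) + P(No)), only in the two-token case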
import argparse
import json

import requests
from tqdm import tqdm
from transformers import AutoTokenizer

from codecritic.utils.json import load_jsonl, save_jsonl
def get_rewards_from_server(server_url: str, messages: list[str]):
    ...  # (body elided in the diff)
    return rewards
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--sample", type=str, help="path/to/sample")
    parser.add_argument("--output", type=str, help="path/to/score")
    args = parser.parse_args()

    dataset = load_jsonl(args.sample)
    server_url = "http://0.0.0.0:5000/get_reward"
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    for item in tqdm(dataset):
        query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
        score = get_rewards_from_server(server_url, [query])[0]
        item["score"] = score

    save_jsonl(dataset, args.output)
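The pass@k aggregation that this script used to do inline now has to run as a separate step. A minimal sketch, assuming the scored JSONL written above and the helpers from codecritic.utils.metric shown later in this commit; all paths and the strategy label are placeholders.

# Sketch only: turn the scored samples into top-k accuracy per difficulty.
import pprint

from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.utils.metric import group_results, score_pass_at_k

scored = load_jsonl("path/to/score.jsonl")      # output of the script above
groups = group_results(scored, "path/to/apps")  # groups by difficulty, ranks each problem by score
eval_results = [score_pass_at_k(groups, k, "orm") for k in range(1, 16)]
save_jsonl(eval_results, "path/to/passk.jsonl")
pprint.pp(eval_results)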
def mk_critic_verify(answer=None):
    # (earlier lines elided in the diff)
    return message


def get_score_token_id(tokenizer, token_str="Yes"):
    score_tokens = tokenizer.encode(token_str, add_special_tokens=False)
    assert len(score_tokens) == 1
    return score_tokens[0]
# Note that the human and observation should appear in odd positions,
# while the llm should appear in even positions.
from pathlib import Path

from codecritic.utils.json import save_jsonl


def mk_message(user, assistant):
    return [
        {"role": "user", "content": user},
        {"role": "assistant", "content": assistant},
    ]


# TODO This function can be removed
def save_jsonl_dataset(dataset, output_dir, split="train"):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    save_jsonl(dataset, output_dir / f"{split}.jsonl")


SPLITTER = "__I_wish_it_were_weekends_all_the_time.__"
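For context on SPLITTER: the generation worker in the next file cuts the rendered chat prompt at this marker, so anything a record carries after the marker is never sent to the model. A small illustration with an invented prompt string:

# Invented example of the truncation behaviour built around SPLITTER.
rendered = "<user>Write a sorting function.<assistant>" + SPLITTER + "text that must not reach the model"
prompt = rendered.split(SPLITTER)[0] if SPLITTER in rendered else rendered
assert prompt == "<user>Write a sorting function.<assistant>"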
from functools import partial
import os

import numpy as np
from vllm import LLM, SamplingParams

SPLITTER = "__I_wish_it_were_weekends_all_the_time.__"


def generate_worker(cuda_device, prompts, model_path, sampling_params):
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_device)
    llm = LLM(
        model=model_path,
        seed=42,
        max_model_len=8 * 1024,
        swap_space=16,
        tensor_parallel_size=len(cuda_device),
    )
    tokenizer = llm.get_tokenizer()
    stop_tokens = [tokenizer.eos_token_id]
    print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")

    vllm_sampling_params = SamplingParams(
        n=sampling_params["n"],
        temperature=sampling_params["temperature"],
        top_p=0.95,
        max_tokens=sampling_params["max_tokens"],
        stop_token_ids=stop_tokens,
    )
    print("Sampling params:", vllm_sampling_params)

    def messages_to_text(messages):
        text = tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        if SPLITTER in text:
            text = text.split(SPLITTER)[0]
        return text

    text_prompts = [messages_to_text(item["messages"]) for item in prompts]
    outputs = llm.generate(
        text_prompts, sampling_params=vllm_sampling_params, use_tqdm=True
    )

    results = []
    for item, output in zip(prompts, outputs):
        ...  # (loop body elided in the diff)
    return results
def score_worker(cuda_device, prompts, model_path, positive_token, negative_token=None):
    def only_positive(logprob):
        positive_logprob = logprob.get(positive_token)
        positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
        return {"score": positive_prob}

    def pos_and_neg(logprob):
        positive_logprob = logprob.get(positive_token)
        positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
        # negative-token lines were elided in the diff; reconstructed by symmetry with the positive token
        negative_logprob = logprob.get(negative_token)
        negative_prob = np.exp(negative_logprob.logprob) if negative_logprob else 0
        return {
            "score": positive_prob / (positive_prob + negative_prob),
            "uncertainty": 1 - (positive_prob + negative_prob),
        }

    compute_score = only_positive if negative_token is None else pos_and_neg

    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_device)
    llm = LLM(
        model=model_path,
        seed=42,
        max_model_len=8 * 1024,
        swap_space=16,
        tensor_parallel_size=len(cuda_device),
    )
    tokenizer = llm.get_tokenizer()
    print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")

    vllm_sampling_params = SamplingParams(n=1, temperature=0, max_tokens=5, logprobs=20)

    text_prompts = [
        tokenizer.apply_chat_template(
            item["messages"], tokenize=False, add_generation_prompt=True
        )
        for item in prompts
    ]
    outputs = llm.generate(
        text_prompts, sampling_params=vllm_sampling_params, use_tqdm=False
    )

    results = []
    for item, output in zip(prompts, outputs):
        assert len(output.outputs) == 1, "The scorer must provide a single score."
        for response in output.outputs:
            # response.logprobs: list[dict[int, Logprob]] https://github.com/vllm-project/vllm/blob/main/vllm/sequence.py
            scores = compute_score(response.logprobs[0])
            # ... (rest of the loop elided in the diff)
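To make the scoring rule concrete, here is the arithmetic the two helpers above implement, on invented logprob values: with only a positive token the score is P("Yes"); with both tokens the score is renormalised over the Yes/No pair and the leftover probability mass is reported as uncertainty.

# Worked example of the Yes/No scoring in score_worker (logprob values invented).
import numpy as np

logprob_yes = -0.105  # hypothetical logprob of the "Yes" token
logprob_no = -2.40    # hypothetical logprob of the "No" token

p_yes = np.exp(logprob_yes)                 # ~0.90
p_no = np.exp(logprob_no)                   # ~0.09

only_positive_score = p_yes                 # used when negative_token is None
pos_and_neg_score = p_yes / (p_yes + p_no)  # ~0.91
uncertainty = 1 - (p_yes + p_no)            # ~0.01, mass the model put outside "Yes"/"No"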
import json
from pathlib import Path


def ensure_parent(path):
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

# ... (rest of the file elided in the diff)
from collections import defaultdict

import numpy as np
from datasets import load_dataset


def estimate_pass_at_k(
    num_samples: list[int], num_correct: list[int], k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    return np.array(
        [estimator(int(n), int(c), k) for n, c in zip(num_samples, num_correct)]
    )
def group_results(results, apps_path):
    """
    Group scored samples by difficulty and problem_id.

    Output:
    {
        "interview": {
            problem_id: [
                {"problem_id": problem_id, "eval_result": True, ...},
                ...
            ],
            ...
        },
        ...
    }
    """
    dataset = load_dataset(apps_path)
    groups = defaultdict(lambda: defaultdict(list))

    for item in results:
        problem_id = item["problem_id"]
        split, idx = problem_id.split("_")
        difficulty = dataset[split][int(idx)]["difficulty"]
        groups[difficulty][problem_id].append(item)

    # If the samples carry scores, rank each problem's samples by score (descending).
    if "score" in results[0]:
        for difficulty, problem in groups.items():
            for problem_id, lst in problem.items():
                sorted_lst = sorted(lst, key=lambda x: x["score"], reverse=True)
                problem[problem_id] = sorted_lst

    return groups
def pass_at_k(groups, k):
    result = {"strategy": "pass@k", "k": k}
    for difficulty, problems in groups.items():
        num_samples, num_correct = [], []
        for lst in problems.values():
            num_samples.append(len(lst))
            num_correct.append(sum(item["eval_result"] for item in lst))
        result[difficulty] = np.mean(estimate_pass_at_k(num_samples, num_correct, k))
    return result


def score_pass_at_k(groups, k, strategy):
    result = {"strategy": strategy, "k": k}
    for difficulty, problems in groups.items():
        num_samples, num_correct = 0, 0
        for lst in problems.values():
            num_samples += 1
            num_correct += any(item["eval_result"] for item in lst[:k])
        result[difficulty] = num_correct / num_samples
    return result
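A small worked check of the two metrics above, on invented counts: the unbiased estimator gives pass@1 = 1 - C(3,1)/C(5,1) = 0.4 for a problem with 2 correct samples out of 5, while score_pass_at_k only asks whether any of the k highest-scored samples passed.

# Worked example for the metrics above (toy data, not from the repo).
import numpy as np

# estimate_pass_at_k: one problem, n=5 samples, c=2 correct, k=1.
n, c, k = 5, 2, 1
pass_at_1 = 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))
assert abs(pass_at_1 - 0.4) < 1e-9

# score_pass_at_k: rank a problem's samples by score and check the top k.
samples = [
    {"eval_result": False, "score": 0.9},
    {"eval_result": True, "score": 0.7},
    {"eval_result": True, "score": 0.2},
]
ranked = sorted(samples, key=lambda x: x["score"], reverse=True)
assert any(item["eval_result"] for item in ranked[:1]) is False  # best-scored sample fails
assert any(item["eval_result"] for item in ranked[:2]) is True   # a passing sample enters the top 2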