Commit 6d68e66f by nzy

refine test

parent ddc93279
@@ -3,18 +3,14 @@ from functools import partial
import os
from transformers import AutoTokenizer
from vllm import SamplingParams
from codecritic.data.genrm_prompt import JUDGE_PROMPT
from codecritic.data.genrm_prompt import THINK_PROMPT, JUDGE_PROMPT, JUDGE_TOEKNS
from codecritic.evaluation.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
def append_prompt(item, content):
item["messages"].append({"role": "user", "content": content})
return item
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="path/to/model")
@@ -22,43 +18,53 @@ if __name__ == "__main__":
parser.add_argument("--output", type=str, help="path/to/score")
parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
parser.add_argument(
"--reason_max_tokens",
type=int,
default=4096,
help="maximum number of tokens allowed for the reasoning process.",
)
parser.add_argument(
"--gpu", type=int, default=1, help="gpu number required by model"
)
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(args.model)
dataset = load_jsonl(args.sample)
if args.reasoning:
dataset = [append_prompt(x, COV_PROMPT) for x in dataset]
worker = partial(
generate_worker,
model_path=args.model,
for item in dataset:
item["messages"].append({"role": "user", "content": THINK_PROMPT})
sampling_params = SamplingParams(
n=1,
temperature=0,
max_tokens=4096,
top_p=0.95,
max_tokens=args.reason_max_tokens,
)
dataset = model_map(worker, dataset, args.gpu_per_model)
worker = partial(
generate_worker, model_path=args.model, sampling_params=sampling_params
)
dataset = model_map(worker, dataset, args.gpu_per_model)
tokenizer = AutoTokenizer.from_pretrained(args.model)
def get_token_id(token):
score_tokens = tokenizer.encode(token, add_special_tokens=False)
assert len(score_tokens) == 1
return score_tokens[0]
positive_token = get_token_id("Yes")
negative_token = get_token_id("No")
dataset = [append_prompt(x, JUDGE_PROMPT) for x in dataset]
positive_token = get_token_id(JUDGE_TOEKNS["positive"])
negative_token = get_token_id(JUDGE_TOEKNS["negative"])
for item in dataset:
item["messages"].append({"role": "user", "content": JUDGE_PROMPT})
worker = partial(
score_worker,
model_path=args.model,
positive_token=positive_token,
negative_token=negative_token
negative_token=negative_token,
)
dataset = model_map(worker, dataset, args.gpu_per_model)
......
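As context for the judge-token lookup in the script above, here is a minimal sketch of why `get_token_id` asserts a single-token encoding: the scoring stage reads the logprob of exactly one vocabulary id per judge token, so each entry of `JUDGE_TOEKNS` must encode to one token. The checkpoint path below is a placeholder for illustration, not necessarily the model used in this repository.

```python
from transformers import AutoTokenizer

# Placeholder checkpoint; any chat model path behaves the same way here.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Coder-7B-Instruct")

def get_token_id(token):
    # The judge token must map to exactly one vocabulary id; otherwise
    # score_worker cannot look up its logprob at a single position.
    ids = tokenizer.encode(token, add_special_tokens=False)
    assert len(ids) == 1, f"{token!r} does not encode to a single token"
    return ids[0]

positive_token = get_token_id("Yes")
negative_token = get_token_id("No")
# Note: with BPE-style tokenizers, " Yes" (leading space) can encode differently
# from "Yes", so the judge tokens should match how the model actually answers.
```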
JUDGE_PROMPT = "Is the code correct (Yes/No)?"
JUDGE_TOEKNS = {
"positive": "Yes",
"negative": "No"
}
def mk_critic_verify(answer=None):
# answer: bool or None
......
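As a usage sketch of the prompt constants defined above (the sample task and solution below are made up, not taken from the dataset), this is how a sampled solution's chat history gets the judge turn appended before scoring:

```python
from codecritic.data.genrm_prompt import JUDGE_PROMPT, JUDGE_TOEKNS

# Hypothetical item; real items come from load_jsonl(args.sample).
item = {
    "messages": [
        {"role": "user", "content": "Write a function that reverses a string."},
        {"role": "assistant", "content": "def rev(s):\n    return s[::-1]"},
    ]
}
item["messages"].append({"role": "user", "content": JUDGE_PROMPT})
# The judge model is then expected to answer with JUDGE_TOEKNS["positive"]
# ("Yes") or JUDGE_TOEKNS["negative"] ("No") as its first generated token.
```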
@@ -18,18 +18,6 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
)
tokenizer = llm.get_tokenizer()
stop_tokens = [tokenizer.eos_token_id]
print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")
vllm_sampling_params = SamplingParams(
n=sampling_params["n"],
temperature=sampling_params["temperature"],
top_p=0.95,
max_tokens=sampling_params["max_tokens"],
stop_token_ids=stop_tokens,
)
print("Sampling params:", vllm_sampling_params)
def messages_to_text(messages):
text = tokenizer.apply_chat_template(
@@ -41,9 +29,7 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
text_prompts = [messages_to_text(item["messages"]) for item in prompts]
outputs = llm.generate(
text_prompts, sampling_params=vllm_sampling_params, use_tqdm=True
)
outputs = llm.generate(text_prompts, sampling_params=sampling_params, use_tqdm=True)
results = []
for item, output in zip(prompts, outputs):
@@ -96,10 +82,6 @@ def score_worker(cuda_device, prompts, model_path, positive_token, negative_toke
)
tokenizer = llm.get_tokenizer()
print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")
vllm_sampling_params = SamplingParams(n=1, temperature=0, max_tokens=5, logprobs=20)
text_prompts = [
tokenizer.apply_chat_template(
item["messages"], tokenize=False, add_generation_prompt=True
@@ -107,9 +89,8 @@ def score_worker(cuda_device, prompts, model_path, positive_token, negative_toke
for item in prompts
]
outputs = llm.generate(
text_prompts, sampling_params=vllm_sampling_params, use_tqdm=False
)
sampling_params = SamplingParams(n=1, temperature=0, max_tokens=5, logprobs=20)
outputs = llm.generate(text_prompts, sampling_params=sampling_params, use_tqdm=True)
results = []
for item, output in zip(prompts, outputs):
......
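For reference, a hedged sketch of how a Yes/No score can be recovered from the `logprobs=20` output requested in `score_worker`. It assumes vLLM's `RequestOutput` layout, where `outputs[0].logprobs[0]` maps token ids to `Logprob` entries for the first generated position; this layout can differ across vLLM versions, and `judge_score` itself is an illustrative helper, not a function from this repository.

```python
import math

def judge_score(output, positive_token, negative_token):
    # Assumes vLLM RequestOutput: logprobs[0] is a dict {token_id: Logprob}
    # for the first generated token (layout may vary by vLLM version).
    first_step = output.outputs[0].logprobs[0]

    def logprob_of(token_id):
        entry = first_step.get(token_id)
        if entry is None:
            return float("-inf")  # judge token fell outside the top-20 logprobs
        return entry.logprob if hasattr(entry, "logprob") else float(entry)

    p_yes = math.exp(logprob_of(positive_token))
    p_no = math.exp(logprob_of(negative_token))
    total = p_yes + p_no
    # Normalize over the two judge tokens so the score lies in [0, 1].
    return p_yes / total if total > 0 else 0.5
```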