Commit 6d68e66f by nzy

refine test

parent ddc93279
@@ -3,18 +3,14 @@ from functools import partial
 import os
 
 from transformers import AutoTokenizer
+from vllm import SamplingParams
 
-from codecritic.data.genrm_prompt import JUDGE_PROMPT
+from codecritic.data.genrm_prompt import THINK_PROMPT, JUDGE_PROMPT, JUDGE_TOEKNS
 from codecritic.evaluation.inference import generate_worker, score_worker
 from codecritic.utils.parallel import model_map
 from codecritic.utils.json import load_jsonl, save_jsonl
 
 
-def append_prompt(item, content):
-    item["messages"].append({"role": "user", "content": content})
-    return item
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", type=str, help="path/to/model")
@@ -22,43 +18,53 @@ if __name__ == "__main__":
     parser.add_argument("--output", type=str, help="path/to/score")
     parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
     parser.add_argument(
+        "--reason_max_tokens",
+        type=int,
+        default=4096,
+        help="maximum number of tokens allowed for the reasoning process.",
+    )
+    parser.add_argument(
         "--gpu", type=int, default=1, help="gpu number required by model"
     )
     args = parser.parse_args()
 
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
     dataset = load_jsonl(args.sample)
 
     if args.reasoning:
-        dataset = [append_prompt(x, COV_PROMPT) for x in dataset]
-        worker = partial(
-            generate_worker,
-            model_path=args.model,
-            n=1,
-            temperature=0,
-            max_tokens=4096,
-        )
-        dataset = model_map(worker, dataset, args.gpu_per_model)
+        for item in dataset:
+            item["messages"].append({"role": "user", "content": THINK_PROMPT})
+
+        sampling_params = SamplingParams(
+            n=1,
+            temperature=0,
+            top_p=0.95,
+            max_tokens=args.reason_max_tokens,
+        )
+        worker = partial(
+            generate_worker, model_path=args.model, sampling_params=sampling_params
+        )
+        dataset = model_map(worker, dataset, args.gpu_per_model)
 
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+
     def get_token_id(token):
         score_tokens = tokenizer.encode(token, add_special_tokens=False)
         assert len(score_tokens) == 1
         return score_tokens[0]
 
-    positive_token = get_token_id("Yes")
-    negative_token = get_token_id("No")
+    positive_token = get_token_id(JUDGE_TOEKNS["positive"])
+    negative_token = get_token_id(JUDGE_TOEKNS["negative"])
 
-    dataset = [append_prompt(x, JUDGE_PROMPT) for x in dataset]
+    for item in dataset:
+        item["messages"].append({"role": "user", "content": JUDGE_PROMPT})
 
     worker = partial(
         score_worker,
         model_path=args.model,
         positive_token=positive_token,
-        negative_token=negative_token
+        negative_token=negative_token,
    )
    dataset = model_map(worker, dataset, args.gpu_per_model)
...
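Note on the new calling convention: the script now builds `SamplingParams` itself and hands it to the worker instead of passing `n`/`temperature`/`max_tokens` separately. Below is a minimal sketch of the reasoning pass under that convention; `run_reasoning_pass` and its arguments are illustrative names, not part of the repo, and it assumes (as the diff does) that each dataset item carries a `messages` list and that `model_map` fans the worker out over the requested GPUs.

```python
# Sketch only: illustrates the refactored reasoning pass; helper name and
# arguments are assumptions, the imported modules are those used in the diff.
from functools import partial

from vllm import SamplingParams

from codecritic.data.genrm_prompt import THINK_PROMPT
from codecritic.evaluation.inference import generate_worker
from codecritic.utils.parallel import model_map


def run_reasoning_pass(dataset, model_path, max_tokens=4096, gpus=1):
    # Append the reasoning instruction as one more user turn per sample.
    for item in dataset:
        item["messages"].append({"role": "user", "content": THINK_PROMPT})

    # The caller now owns SamplingParams; generate_worker no longer rebuilds it.
    sampling_params = SamplingParams(n=1, temperature=0, top_p=0.95, max_tokens=max_tokens)
    worker = partial(generate_worker, model_path=model_path, sampling_params=sampling_params)
    return model_map(worker, dataset, gpus)
```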
 JUDGE_PROMPT = "Is the code correct (Yes/No)?"
+
+JUDGE_TOEKNS = {
+    "positive": "Yes",
+    "negative": "No"
+}
 
 
 def mk_critic_verify(answer=None):
     # answer: bool or none
...
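The `JUDGE_TOEKNS` strings only work with score_worker if each one encodes to a single token id, which is exactly what the script's `get_token_id` assertion enforces. A small illustration of that check, assuming only the `transformers` tokenizer API; the helper name is hypothetical:

```python
# Illustration only: verify that every JUDGE_TOEKNS entry maps to exactly one
# token id for the chosen model, mirroring the script's get_token_id assertion.
from transformers import AutoTokenizer

from codecritic.data.genrm_prompt import JUDGE_TOEKNS


def judge_token_ids(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    ids = {}
    for label, token in JUDGE_TOEKNS.items():
        encoded = tokenizer.encode(token, add_special_tokens=False)
        assert len(encoded) == 1, f"{token!r} is not a single token for this tokenizer"
        ids[label] = encoded[0]
    return ids  # e.g. {"positive": <id of "Yes">, "negative": <id of "No">}
```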
@@ -18,18 +18,6 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
     )
     tokenizer = llm.get_tokenizer()
 
-    stop_tokens = [tokenizer.eos_token_id]
-    print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")
-
-    vllm_sampling_params = SamplingParams(
-        n=sampling_params["n"],
-        temperature=sampling_params["temperature"],
-        top_p=0.95,
-        max_tokens=sampling_params["max_tokens"],
-        stop_token_ids=stop_tokens,
-    )
-    print("Sampling params:", vllm_sampling_params)
-
     def messages_to_text(messages):
         text = tokenizer.apply_chat_template(
@@ -41,9 +29,7 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
     text_prompts = [messages_to_text(item["messages"]) for item in prompts]
 
-    outputs = llm.generate(
-        text_prompts, sampling_params=vllm_sampling_params, use_tqdm=True
-    )
+    outputs = llm.generate(text_prompts, sampling_params=sampling_params, use_tqdm=True)
 
     results = []
     for item, output in zip(prompts, outputs):
@@ -96,10 +82,6 @@ def score_worker(cuda_device, prompts, model_path, positive_token, negative_token):
     )
     tokenizer = llm.get_tokenizer()
 
-    print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")
-
-    vllm_sampling_params = SamplingParams(n=1, temperature=0, max_tokens=5, logprobs=20)
-
     text_prompts = [
         tokenizer.apply_chat_template(
             item["messages"], tokenize=False, add_generation_prompt=True
@@ -107,9 +89,8 @@ def score_worker(cuda_device, prompts, model_path, positive_token, negative_token):
         for item in prompts
     ]
 
-    outputs = llm.generate(
-        text_prompts, sampling_params=vllm_sampling_params, use_tqdm=False
-    )
+    sampling_params = SamplingParams(n=1, temperature=0, max_tokens=5, logprobs=20)
+    outputs = llm.generate(text_prompts, sampling_params=sampling_params, use_tqdm=True)
 
     results = []
     for item, output in zip(prompts, outputs):
...
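The part of score_worker that turns the generation output into a score is elided ("...") in this diff, so the following is a hypothetical sketch only, not the repo's implementation. It shows one common way the `logprobs=20` output and the `positive_token`/`negative_token` ids could be combined into a probability of "Yes", assuming vLLM's per-position logprob dictionaries keyed by token id.

```python
# Hypothetical sketch: derive P(positive) from the first generated token's
# logprobs. Function name is illustrative; the actual score_worker body is not
# shown in this diff.
import math


def positive_probability(output, positive_token, negative_token):
    # Logprob dict for the first generated token (assumed vLLM layout:
    # output.outputs[0].logprobs is a list of {token_id: logprob-or-Logprob}).
    first_step = output.outputs[0].logprobs[0]

    def logprob_of(token_id):
        entry = first_step.get(token_id)
        if entry is None:
            return float("-inf")
        # Newer vLLM wraps values in a Logprob object; older versions use floats.
        return entry.logprob if hasattr(entry, "logprob") else entry

    p_yes = math.exp(logprob_of(positive_token))
    p_no = math.exp(logprob_of(negative_token))
    total = p_yes + p_no
    return p_yes / total if total > 0 else 0.0
```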