Commit 8bf1b27d by nanziyuan

fix bugs

parent 9c8fbf86
import argparse
import os
from pathlib import Path
import pprint
from tqdm import tqdm
import torch
@@ -11,10 +12,11 @@ import accelerate
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.data.verify import get_score_token_id
from codecritic.utils.metric import group_results, score_pass_at_k
@torch.inference_mode()
def hf_score(model, tokenizer, prompts):
def hf_score(accelerator, model, tokenizer, prompts):
    score_token = get_score_token_id(tokenizer)
    with accelerator.split_between_processes(prompts) as partial_prompts:
        results = []
@@ -23,7 +25,7 @@ def hf_score(model, tokenizer, prompts):
            input_ids = tokenizer.apply_chat_template(
                item["messages"], add_generation_prompt=True, return_tensors="pt"
            ).to("cuda")
            output = model(**input_ids)
            output = model(input_ids)
            next_token_logits = output.logits[0, -1, :]
            score = F.softmax(next_token_logits, dim=0)[score_token].item()
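The two hunks above show only fragments of hf_score; the loop over the prompt shard and the collection of per-process results are elided. A minimal, self-contained sketch of the whole scoring path, assuming results are merged with accelerate.utils.gather_object (the gathering code is not part of this commit, and hf_score_sketch is a hypothetical name), could look like:

import torch
import torch.nn.functional as F
from accelerate.utils import gather_object

@torch.inference_mode()
def hf_score_sketch(accelerator, model, tokenizer, prompts, score_token):
    # Each process scores only its shard of the prompts.
    with accelerator.split_between_processes(prompts) as shard:
        results = []
        for item in shard:
            input_ids = tokenizer.apply_chat_template(
                item["messages"], add_generation_prompt=True, return_tensors="pt"
            ).to(model.device)
            # logits: (batch, seq_len, vocab); take the next-token distribution
            # and read off the probability assigned to the score token.
            next_token_logits = model(input_ids).logits[0, -1, :]
            item["score"] = F.softmax(next_token_logits, dim=-1)[score_token].item()
            results.append(item)
    # Assumption: the per-process shards are merged back into one list everywhere.
    return gather_object(results)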
@@ -35,32 +37,43 @@ def hf_score(model, tokenizer, prompts):
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str)
    parser.add_argument("--prompts", type=str)
    parser.add_argument("--out", type=str)
    parser.add_argument("--test", type=str)
    parser.add_argument("--apps", type=str)
    args = parser.parse_args()
    home_path = Path(args.model).parent
    result_dir = home_path / "hf_eval"
    result_dir.mkdir(exist_ok=True)
    prompts = load_jsonl(args.prompts)
    prompts = load_jsonl(args.test)
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    # os.environ["TOKENIZERS_PARALLELISM"] = "false"
    accelerator = accelerate.Accelerator()
    tokenizer = transformers.AutoTokenizer.from_pretrained(args.model)
    model = transformers.AutoModelForCausalLM.from_pretrained(
        args.model, device_map={"": accelerator.process_index}
    )
    model = transformers.AutoModelForCausalLM.from_pretrained(args.model, device_map="auto")
    # model, tokenizer = accelerator.prepare(model, tokenizer)
    for name, param in model.named_parameters():
        print(f"{name}: {param.device}")
    # model.generation_config.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id
    # model.generation_config.eos_token_id = tokenizer.eos_token_id
    model.eval()
    accelerator.wait_for_everyone()
    results = hf_score(model, tokenizer, prompts)
    results = hf_score(accelerator, model, tokenizer, prompts)
    if accelerator.is_main_process:
        save_jsonl(results, args.out)
        score_path = result_dir / "scores.jsonl"
        save_jsonl(results, score_path)
        # compute pass@k
        eval_result_path = result_dir / "passk.jsonl"
        # results = load_jsonl(score_path)
        groups = group_results(results, args.apps)
        eval_results = [score_pass_at_k(groups, k, home_path.stem) for k in range(1, 16)]
        save_jsonl(eval_results, eval_result_path)
        pprint.pp(eval_results)
if __name__ == "__main__":
......
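group_results and score_pass_at_k come from codecritic.utils.metric and their implementations are not shown in this commit. For background only, the widely used unbiased pass@k estimator (introduced with HumanEval) that such helpers commonly implement is sketched below; whether score_pass_at_k matches it exactly is an assumption.

from math import comb

def pass_at_k(n, c, k):
    # Unbiased estimator of pass@k: 1 - C(n - c, k) / C(n, k),
    # where n = samples drawn per problem and c = samples that pass the tests.
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)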
@@ -15,14 +15,6 @@ from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.utils.metric import group_results, score_pass_at_k
def preprocess_test_item(item):
    question = item["messages"][0]["content"]
    answer = item["messages"][1]["content"]
    code = code_template.format(extract_code(answer))
    item["messages"] = mk_message(question, code)
    return item
def append_prompt(item, content):
item["messages"].append({"role": "user", "content": content})
return item
@@ -34,8 +26,7 @@ def run_sft_model(model_path, test_path, apps_path, reason_prompt, model_gpu):
    result_dir.mkdir(exist_ok=True)
    # preprocess prompt
    raw_test_dataset = load_jsonl(test_path)
    test_dataset = [preprocess_test_item(item) for item in raw_test_dataset]
    test_dataset = load_jsonl(test_path)
    # reason
    if reason_prompt:
......
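The body of the if reason_prompt: branch is elided above. A minimal sketch of how it might continue, assuming it simply reuses the append_prompt helper this file keeps (an assumption, not code from this commit):

    # Assumption: the reasoning prompt is appended to every test item.
    if reason_prompt:
        test_dataset = [append_prompt(item, reason_prompt) for item in test_dataset]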