Commit 38be25e7 by nzy

step4: refactor test reward & fix bug in code_template

parent 61469b06
import requests
import subprocess
import os
import json
import time
from tqdm.contrib.concurrent import thread_map
from copy import deepcopy
from utils import load_jsonl, save_jsonl, extract_code, read_config
from utils_metric import group_results, score_pass_at_k
from utils_dataset import code_template
from transformers import AutoTokenizer

def run_server(api_port, cuda_device, rm_inference_yaml_path, llamafactory_path):
    # Launch one LLaMA-Factory API server pinned to a single GPU via API_PORT / CUDA_VISIBLE_DEVICES.
    env = os.environ.copy()
    env["API_PORT"] = str(api_port)
    env["CUDA_VISIBLE_DEVICES"] = str(cuda_device)
    server_process = subprocess.Popen(
        ["llamafactory-cli", "api", rm_inference_yaml_path],
        stdout=subprocess.PIPE,
        env=env,
        cwd=llamafactory_path,
        text=True
    )
    # Wait until the server prints a line containing "start output" before returning.
    for line in server_process.stdout:
        if "start output" in line:
            break  # TODO
    print(
        f"Started server with PID {server_process.pid} on port {api_port} and CUDA device {cuda_device}"
    )
    return server_process

def start_servers(llamafactory_path, inference_cfg_path):
    cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
    gpu_num = len(cuda_devices)
    server_processes = [
        run_server(8000 + i, cuda_devices[i], inference_cfg_path, llamafactory_path)
        for i in range(gpu_num)
    ]
    time.sleep(10)  # Wait for the servers to start (adjust the sleep time as needed)
    return server_processes
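# Illustrative behaviour: with CUDA_VISIBLE_DEVICES="0,1,2,3" this launches four
# reward-model API servers on ports 8000-8003, one per GPU. The fixed 10 s wait
# above may need to be increased for slow-loading checkpoints.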
def stop_servers(server_processes):
    for server_process in server_processes:
        server_process.terminate()
        server_process.wait()
        print(f"Terminated server with PID {server_process.pid}")

def get_rewards_from_server(server_url: str, messages: list[str]):
    """
    Gets reward scores from the API server.
    """
    headers = {"Content-Type": "application/json"}
    payload = {"model": "model", "messages": messages}
    response = requests.post(server_url, json=payload, headers=headers)
    rewards = json.loads(response.text)["scores"]
    return rewards
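# Illustrative request/response for the scoring endpoint (response fields other
# than "scores" depend on the LLaMA-Factory API version and are not used here):
#   POST http://0.0.0.0:8000/v1/score/evaluation
#   {"model": "model", "messages": ["<chat-templated conversation>"]}
#   -> {"scores": [0.87]}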
def preprocess_dataset(model_path, test_dataset, gpu_num):
    "apply chat_template and split the dataset to different gpu"
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    result = []
    for i, item in enumerate(test_dataset):
        messages = deepcopy(item["messages"])
        messages[-1]["content"] = code_template.format(extract_code(messages[-1]["content"]))
        # https://github.com/hiyouga/LLaMA-Factory/blob/a45f3f5461e2936b9e119eda2ef4d8c7a4131740/tests/data/test_template.py#L58
        # llama factory's template should match tokenizer's `apply_chat_template`.
        item["format_str"] = [tokenizer.apply_chat_template(messages, tokenize=False)]
        result.append((item, 8000 + i % gpu_num))
    return result
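# Example of the round-robin split above: with gpu_num=4, consecutive items are
# paired with ports 8000, 8001, 8002, 8003, 8000, ... so requests are spread
# evenly across the per-GPU servers.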
def test_reward_model(item, api_port):
    server_url = f"http://0.0.0.0:{api_port}/v1/score/evaluation"
    score = get_rewards_from_server(server_url, item["format_str"])[0]
    return {
        "problem_id": item["problem_id"],
        "messages": item["messages"],
        "eval_result": item["eval_result"],
        "score": score,
    }
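# Each result record keeps the problem metadata and adds the reward-model score,
# e.g. (values purely illustrative):
#   {"problem_id": "0001", "messages": [...], "eval_result": True, "score": 0.42}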
if __name__ == "__main__":
    cfg = read_config(["orm_testmodel"])
    orm_test_model = cfg["orm_testmodel"]
    raw_test_dataset = load_jsonl(cfg["dataset"]["minimal_test_path"])

    servers = start_servers(
        cfg["llamafactory_path"],
        cfg["orm"][orm_test_model]["inference_yaml_path"],
    )
    test_dataset = preprocess_dataset(
        cfg["orm"][orm_test_model]["model_path"],
        raw_test_dataset,
        len(servers)
    )
    results = thread_map(lambda arg: test_reward_model(*arg),
                         test_dataset,
                         max_workers=len(servers))
    save_jsonl(results, cfg["orm"][orm_test_model]["minimal_test_score_path"])
    stop_servers(servers)

    groups = group_results(results, cfg["apps"])
    eval_results = [score_pass_at_k(groups, k, orm_test_model) for k in range(1, 32)]
    save_jsonl(eval_results, cfg["orm"][orm_test_model]["eval_result_path"])
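# Illustrative launch: this script takes no CLI arguments, reads all paths from the
# config, and starts one server per visible GPU, e.g.
#   CUDA_VISIBLE_DEVICES=0,1,2,3 python step4_test_reward_model.py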
+import argparse
+from copy import deepcopy
+import json
 from tqdm import tqdm
-from utils import load_jsonl, save_jsonl, read_config
+import requests
+from transformers import AutoTokenizer
+import pprint
+from pathlib import Path
+from utils import load_jsonl, save_jsonl, extract_code, code_template
 from utils_metric import group_results, score_pass_at_k
-from step4_test_reward_model import preprocess_dataset, test_reward_model
-if __name__ == "__main__":
-    cfg = read_config(["orm_testmodel"])
-    orm_test_model = cfg["orm_testmodel"]
-    raw_test_dataset = load_jsonl(cfg["dataset"]["minimal_test_path"])
-    model_path = cfg["orm"][orm_test_model]["model_path"]
-    test_dataset = preprocess_dataset(model_path, raw_test_dataset, 1)
+
+
+def get_rewards_from_server(server_url: str, messages: list[str]):
+    """
+    Gets reward scores from the API server.
+    """
+    headers = {"Content-Type": "application/json"}
+    payload = {"model": "model", "messages": messages}
+    response = requests.post(server_url, json=payload, headers=headers)
+    rewards = json.loads(response.text)["scores"]
+    return rewards
+
+
+def preprocess_dataset(model_path, test_dataset, gpu_num):
+    "apply chat_template and split the dataset to different gpu"
+    tokenizer = AutoTokenizer.from_pretrained(model_path)
+    result = []
+    for i, item in enumerate(test_dataset):
+        messages = deepcopy(item["messages"])
+        messages[-1]["content"] = code_template.format(
+            extract_code(messages[-1]["content"])
+        )
+        # https://github.com/hiyouga/LLaMA-Factory/blob/a45f3f5461e2936b9e119eda2ef4d8c7a4131740/tests/data/test_template.py#L58
+        # llama factory's template should match tokenizer's `apply_chat_template`.
+        item["format_str"] = [tokenizer.apply_chat_template(messages, tokenize=False)]
+        result.append((item, 8000 + i % gpu_num))
+    return result
+
+
+def test_reward_model(item, api_port):
+    server_url = f"http://0.0.0.0:{api_port}/v1/score/evaluation"
+    score = get_rewards_from_server(server_url, item["format_str"])[0]
+    return {
+        "problem_id": item["problem_id"],
+        "messages": item["messages"],
+        "eval_result": item["eval_result"],
+        "score": score,
+    }
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str)
+    parser.add_argument("--test", type=str)
+    parser.add_argument("--apps", type=str)
+    args = parser.parse_args()
+
+    home_path = Path(args.model).parent
+    result_dir = home_path / "eval"
+    result_dir.mkdir(exist_ok=True)
+
+    # compute score
+    score_path = result_dir / "scores.jsonl"
+    raw_test_dataset = load_jsonl(args.test)
+    test_dataset = preprocess_dataset(args.model, raw_test_dataset, 1)
     results = [test_reward_model(*arg) for arg in tqdm(test_dataset)]
-    save_jsonl(results, cfg["orm"][orm_test_model]["minimal_test_score_path"])
-    # results = load_jsonl(result_path)
-    groups = group_results(results, cfg["apps"])
-    eval_results = [score_pass_at_k(groups, k, orm_test_model) for k in range(1, 16)]
-    save_jsonl(eval_results, cfg["orm"][orm_test_model]["eval_result_path"])
-    print(eval_results)
+    save_jsonl(results, score_path)
+
+    # compute pass@k
+    eval_result_path = result_dir / "passk.jsonl"
+    # results = load_jsonl(result_path)
+    groups = group_results(results, args.apps)
+    eval_results = [score_pass_at_k(groups, k, home_path.stem) for k in range(1, 16)]
+    save_jsonl(eval_results, eval_result_path)
+    pprint.pp(eval_results)
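A typical invocation of the refactored, argparse-based script above (script name and
paths are placeholders):

    python step4_test_reward_model.py --model /path/to/reward_model --test minimal_test.jsonl --apps /path/to/apps

Scores are written to <model parent>/eval/scores.jsonl and the pass@k numbers to
<model parent>/eval/passk.jsonl.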
 from utils_vllm import vllm_score
-from utils import read_config, load_jsonl, save_jsonl, extract_code
+from utils import read_config, load_jsonl, save_jsonl, extract_code, code_template
-from utils_dataset import code_template, mk_critic_qa, mk_critic_verify
+from utils_dataset import mk_critic_qa, mk_critic_verify
 from utils_metric import group_results, score_pass_at_k
 from transformers import AutoTokenizer
...
-from pathlib import Path
 import json
-import os
 import re
 from codebleu import calc_codebleu
 import sys
...
@@ -33,7 +31,10 @@ def save_json(data, file_path, indent=None):
 codeblock_pattern = re.compile(r"```python(.+?)```", flags=re.DOTALL)
+code_template = """```python
+{}
+```
+"""
 def extract_code(text: str):
     codes = [match.strip() for match in re.findall(codeblock_pattern, text)]
...
-from utils import load_json, save_json
+from utils import load_json, save_json, code_template
 def mk_preference_dataset_info(dataset_name):
@@ -23,11 +23,6 @@ def mk_preference_dataset_info(dataset_name):
     }
-# see utils.extract_code
-# TODO Check the code format in dataset
-code_template = r"```python{}```"
 def mk_preference_pair(instruction, chosen_code, rejected_code):
     return {
         "messages": [
...
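The "fix bug in code_template" part of the commit message comes down to newlines: the
old template in utils_dataset glued the fence, language tag and code together, while the
new one in utils puts them on separate lines. A minimal illustration (plain str.format,
nothing repo-specific):

old_template = r"```python{}```"         # previous definition in utils_dataset
new_template = "```python\n{}\n```\n"    # new definition in utils

old_template.format("print(1)")  # '```pythonprint(1)```'  -> fence and code fused
new_template.format("print(1)")  # '```python\nprint(1)\n```\n'  -> proper code block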