Commit c72d23fc by nanziyuan

step4: test_reward_model: replace multi-threaded request handling with a single-threaded approach

- Addresses occasional deadlocks in multi-threaded model server requests.
- Implemented a single-threaded, single-server setup for testing purposes.
- Observed that the reward model's performance is acceptable, so it can be used.
- Temporarily adopting this single-threaded version.
parent 52fa8d54
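
The diff below contains only the new single-threaded script; the multi-threaded request path it replaces is not shown. For context, a hedged sketch of the kind of thread-pool fan-out being removed (the function name and pool size are assumptions, and it reuses reward_model_inference from the script below):

# Hypothetical sketch of the removed multi-threaded request path; not the actual
# deleted code. Assumes reward_model_inference as defined in the script below.
from concurrent.futures import ThreadPoolExecutor

def reward_model_inference_threaded(test_dataset, max_workers=8):
    # Several threads POST to the reward-model server concurrently; per the
    # commit message this occasionally deadlocked, hence the sequential loop.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(reward_model_inference, test_dataset))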
from tqdm import tqdm
import requests
import json
from utils import load_jsonl, extract_code, save_jsonl, read_config
from utils_metric import group_results, score_pass_at_k
from transformers import AutoTokenizer


def get_rewards_from_server(server_url: str, messages: list[str]):
    """Get reward scores from the API server."""
    headers = {"Content-Type": "application/json"}
    payload = {"model": "model", "messages": messages}
    response = requests.post(server_url, json=payload, headers=headers)
    rewards = json.loads(response.text)["scores"]
    return rewards


def reward_model_inference(item):
    server_url = "http://0.0.0.0:8000/v1/score/evaluation"
    score = get_rewards_from_server(server_url, item["format_str"])[0]
    return {
        "problem_id": item["problem_id"],
        "messages": item["messages"],
        "eval_result": item["eval_result"],
        "score": score,
    }


def preprocess_dataset(model_path, test_dataset):
    """Apply the chat template and split the dataset across GPUs."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    result = []
    for item in test_dataset:
        messages = item["messages"]
        messages[-1]["content"] = extract_code(messages[-1]["content"])
        # https://github.com/hiyouga/LLaMA-Factory/blob/a45f3f5461e2936b9e119eda2ef4d8c7a4131740/tests/data/test_template.py#L58
        # LLaMA Factory's template should match the tokenizer's `apply_chat_template`.
        item["format_str"] = [tokenizer.apply_chat_template(messages, tokenize=False)]
        result.append(item)
    return result


if __name__ == "__main__":
    cfg = read_config(["orm_testmodel"])
    orm_test_model = cfg["orm_testmodel"]
    test_path = cfg["dataset"]["minimal_test_path"]
    model_path = cfg["orm"][orm_test_model]["model_path"]
    result_path = cfg["orm"][orm_test_model]["minimal_test_score_path"]
    eval_result_path = cfg["orm"][orm_test_model]["eval_result_path"]

    test_dataset = preprocess_dataset(model_path, load_jsonl(test_path))
    results = [reward_model_inference(x) for x in tqdm(test_dataset)]
    save_jsonl(results, result_path)
    # results = load_jsonl(result_path)

    groups = group_results(results, cfg["apps"])
    eval_results = [score_pass_at_k(groups, k, orm_test_model) for k in range(1, 32)]
    save_jsonl(eval_results, eval_result_path)
    print(eval_results)
@@ -56,7 +56,7 @@ def group_results(results, apps_path):
 def pass_at_k(groups, k):
-    result = {"strategy": f"pass@k={k}"}
+    result = {"strategy": "pass@k", "k": k}
     for difficulty, problems in groups.items():
         num_samples, num_correct = [], []
@@ -73,12 +73,12 @@ def score_pass_at_k(groups, k, strategy):
     result = {"strategy": strategy, "k": k}
     for difficulty, problems in groups.items():
-        num_samples, num_correct = [], []
+        num_samples, num_correct = 0, 0
         for lst in problems.values():
-            num_samples.append(len(lst))
-            num_correct.append(sum(item["eval_result"] for item in lst[:k]))
-        pass_at_k = np.mean([c / n for c, n in zip(num_correct, num_samples)])
+            num_samples += 1
+            num_correct += any(item["eval_result"] for item in lst[:k])
+        pass_at_k = num_correct / num_samples
         result[difficulty] = pass_at_k
     return result
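
The second hunk changes what score_pass_at_k measures: the old code averaged each problem's correct count over its total samples, while the new code reports the fraction of problems with at least one passing sample among the first k. A toy illustration with made-up data (two problems in one difficulty bucket, not from the repository):

# Made-up two-problem example showing old vs. new accounting for k=2.
groups = {
    "p1": [{"eval_result": 0}, {"eval_result": 1}],
    "p2": [{"eval_result": 0}, {"eval_result": 0}],
}
k = 2
lsts = list(groups.values())
# Old accounting: per-problem (correct in first k) / (total samples), averaged.
old = sum(sum(x["eval_result"] for x in lst[:k]) / len(lst) for lst in lsts) / len(lsts)
# New accounting: fraction of problems with any pass among the first k.
new = sum(any(x["eval_result"] for x in lst[:k]) for lst in lsts) / len(lsts)
print(old, new)  # 0.25 0.5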