Commit c72d23fc by nanziyuan

step4: test_reward_model: replace multi-threaded request handling with a single-threaded approach

- Addresses occasional deadlocks in multi-threaded model server requests.
- Implemented a single-threaded, single-server setup for testing purposes.
- Observed that the reward model's performance is acceptable, so it can be used.
- Temporarily adopting this single-threaded version.
parent 52fa8d54
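
The diff below contains only the new single-threaded script; the multi-threaded request path it replaces is not shown. For context, a hedged sketch of the kind of thread-pool fan-out being removed (the function name and pool size are assumptions, and it reuses reward_model_inference from the script below):

# Hypothetical sketch of the removed multi-threaded request path; not the actual
# deleted code. Assumes reward_model_inference as defined in the script below.
from concurrent.futures import ThreadPoolExecutor

def reward_model_inference_threaded(test_dataset, max_workers=8):
    # Several threads POST to the reward-model server concurrently; per the
    # commit message this occasionally deadlocked, hence the sequential loop.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(reward_model_inference, test_dataset))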
from tqdm import tqdm
import requests
import json
from utils import load_jsonl, extract_code, save_jsonl, read_config
from utils_metric import group_results, score_pass_at_k
from transformers import AutoTokenizer


def get_rewards_from_server(server_url: str, messages: list[str]):
    """Get reward scores from the API server."""
    headers = {"Content-Type": "application/json"}
    payload = {"model": "model", "messages": messages}
    response = requests.post(server_url, json=payload, headers=headers)
    rewards = json.loads(response.text)["scores"]
    return rewards


def reward_model_inference(item):
    server_url = "http://0.0.0.0:8000/v1/score/evaluation"
    score = get_rewards_from_server(server_url, item["format_str"])[0]
    return {
        "problem_id": item["problem_id"],
        "messages": item["messages"],
        "eval_result": item["eval_result"],
        "score": score,
    }


def preprocess_dataset(model_path, test_dataset):
    """Apply the chat template and split the dataset across GPUs."""
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    result = []
    for item in test_dataset:
        messages = item["messages"]
        messages[-1]["content"] = extract_code(messages[-1]["content"])
        # https://github.com/hiyouga/LLaMA-Factory/blob/a45f3f5461e2936b9e119eda2ef4d8c7a4131740/tests/data/test_template.py#L58
        # LLaMA Factory's template should match the tokenizer's `apply_chat_template`.
        item["format_str"] = [tokenizer.apply_chat_template(messages, tokenize=False)]
        result.append(item)
    return result


if __name__ == "__main__":
    cfg = read_config(["orm_testmodel"])
    orm_test_model = cfg["orm_testmodel"]
    test_path = cfg["dataset"]["minimal_test_path"]
    model_path = cfg["orm"][orm_test_model]["model_path"]
    result_path = cfg["orm"][orm_test_model]["minimal_test_score_path"]
    eval_result_path = cfg["orm"][orm_test_model]["eval_result_path"]

    test_dataset = preprocess_dataset(model_path, load_jsonl(test_path))
    results = [reward_model_inference(x) for x in tqdm(test_dataset)]
    save_jsonl(results, result_path)
    # results = load_jsonl(result_path)

    groups = group_results(results, cfg["apps"])
    eval_results = [score_pass_at_k(groups, k, orm_test_model) for k in range(1, 32)]
    save_jsonl(eval_results, eval_result_path)
    print(eval_results)
@@ -56,7 +56,7 @@ def group_results(results, apps_path):
 def pass_at_k(groups, k):
-    result = {"strategy": f"pass@k={k}"}
+    result = {"strategy": "pass@k", "k": k}
     for difficulty, problems in groups.items():
         num_samples, num_correct = [], []
@@ -73,12 +73,12 @@ def score_pass_at_k(groups, k, strategy):
     result = {"strategy": strategy, "k": k}
     for difficulty, problems in groups.items():
-        num_samples, num_correct = [], []
+        num_samples, num_correct = 0, 0
         for lst in problems.values():
-            num_samples.append(len(lst))
-            num_correct.append(sum(item["eval_result"] for item in lst[:k]))
-        pass_at_k = np.mean([c / n for c, n in zip(num_correct, num_samples)])
+            num_samples += 1
+            num_correct += any(item["eval_result"] for item in lst[:k])
+        pass_at_k = num_correct / num_samples
         result[difficulty] = pass_at_k
     return result
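
The second hunk changes what score_pass_at_k measures: the old code averaged each problem's correct count over its total samples, while the new code reports the fraction of problems with at least one passing sample among the first k. A toy illustration with made-up data (two problems in one difficulty bucket, not from the repository):

# Made-up two-problem example showing old vs. new accounting for k=2.
groups = {
    "p1": [{"eval_result": 0}, {"eval_result": 1}],
    "p2": [{"eval_result": 0}, {"eval_result": 0}],
}
k = 2
lsts = list(groups.values())
# Old accounting: per-problem (correct in first k) / (total samples), averaged.
old = sum(sum(x["eval_result"] for x in lst[:k]) / len(lst) for lst in lsts) / len(lsts)
# New accounting: fraction of problems with any pass among the first k.
new = sum(any(x["eval_result"] for x in lst[:k]) for lst in lsts) / len(lsts)
print(old, new)  # 0.25 0.5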