Commit ddc93279 by nzy

Refactor evaluation. This commit only covers test_genrm and test_orm; the other scripts do not work yet.

parent 4e3e7502
import argparse
import os
from pathlib import Path
import pprint
from codecritic.data.utils import mk_message
from codecritic.data.verify import JUDGE_PROMPT
from transformers import AutoTokenizer
from codecritic.data.code import extract_code, code_template
from codecritic.data.cov import COV_PROMPT
from codecritic.data.verify import get_score_token_id
from codecritic.utils.vllm import vllm_chatcomplete, vllm_score
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.utils.metric import group_results, score_pass_at_k

def append_prompt(item, content):
    item["messages"].append({"role": "user", "content": content})
    return item

def run_sft_model(model_path, test_path, apps_path, reason_prompt, model_gpu):
    home_path = Path(model_path).parent
    result_dir = home_path / "eval"
    result_dir.mkdir(exist_ok=True)

    # preprocess prompt
    test_dataset = load_jsonl(test_path)

    # reason: only run the extra generation pass when a reasoning prompt is given
    if reason_prompt:
        test_dataset = [append_prompt(x, reason_prompt) for x in test_dataset]
        sampling_params = dict(n=1, temperature=0.0, max_tokens=2048)
        test_dataset = vllm_chatcomplete(model_path, test_dataset, sampling_params, model_gpu)

    # score
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    score_token = get_score_token_id(tokenizer)
    test_dataset = [append_prompt(x, JUDGE_PROMPT) for x in test_dataset]
    results = vllm_score(model_path, test_dataset, score_token, model_gpu)
    score_path = result_dir / "scores.jsonl"
    save_jsonl(results, score_path)

    # compute pass@k
    eval_result_path = result_dir / "passk.jsonl"
    # results = load_jsonl(score_path)
    groups = group_results(results, apps_path)
    eval_results = [score_pass_at_k(groups, k, home_path.stem) for k in range(1, 16)]
    save_jsonl(eval_results, eval_result_path)
    pprint.pp(eval_results)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str)
    parser.add_argument("--test", type=str)
    parser.add_argument("--apps", type=str)
    parser.add_argument("--reason", choices=["cov"])
    parser.add_argument("--gpu", type=int, default=1, help="gpu number required by model")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    reason_prompts = {"cov": COV_PROMPT}
    reason_prompt = reason_prompts.get(args.reason, None)

    run_sft_model(args.model, args.test, args.apps, reason_prompt, args.gpu)
import argparse
from functools import partial
import os
from transformers import AutoTokenizer
from codecritic.data.cov import COV_PROMPT
from codecritic.data.genrm_prompt import JUDGE_PROMPT
from codecritic.evaluation.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl

def append_prompt(item, content):
    item["messages"].append({"role": "user", "content": content})
    return item

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--sample", type=str, help="path/to/sample")
    parser.add_argument("--output", type=str, help="path/to/score")
    parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
    parser.add_argument(
        "--gpu", type=int, default=1, help="gpu number required by model"
    )
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    dataset = load_jsonl(args.sample)

    if args.reasoning:
        dataset = [append_prompt(x, COV_PROMPT) for x in dataset]
        # generate_worker expects a single sampling_params dict (see inference.py)
        worker = partial(
            generate_worker,
            model_path=args.model,
            sampling_params=dict(n=1, temperature=0, max_tokens=4096),
        )
        dataset = model_map(worker, dataset, args.gpu)

    tokenizer = AutoTokenizer.from_pretrained(args.model)

    def get_token_id(token):
        score_tokens = tokenizer.encode(token, add_special_tokens=False)
        assert len(score_tokens) == 1
        return score_tokens[0]

    positive_token = get_token_id("Yes")
    negative_token = get_token_id("No")

    dataset = [append_prompt(x, JUDGE_PROMPT) for x in dataset]
    worker = partial(
        score_worker,
        model_path=args.model,
        positive_token=positive_token,
        negative_token=negative_token,
    )
    dataset = model_map(worker, dataset, args.gpu)

    save_jsonl(dataset, args.output)
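Unlike the old script above, the refactored version stops after writing scores; pass@k is presumably computed in a separate step. Below is a minimal sketch of that follow-up step reusing group_results and score_pass_at_k; the file names, the APPS path, and the "genrm" strategy label are assumptions, not part of the commit.

# Hypothetical follow-up step: turn the saved scores into pass@k numbers.
# Paths and the strategy label are placeholders.
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.utils.metric import group_results, score_pass_at_k

scores = load_jsonl("scores.jsonl")              # the --output file written above
groups = group_results(scores, "path/to/apps")   # groups items by APPS difficulty
passk = [score_pass_at_k(groups, k, "genrm") for k in range(1, 16)]
save_jsonl(passk, "passk.jsonl")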
 import argparse
 import json
-from tqdm import tqdm
 import requests
+from tqdm import tqdm
 from transformers import AutoTokenizer
-import pprint
-from pathlib import Path
-from codecritic.data.code import code_template, extract_code
 from codecritic.utils.json import load_jsonl, save_jsonl
-from codecritic.utils.metric import group_results, score_pass_at_k


 def get_rewards_from_server(server_url: str, messages: list[str]):
@@ -22,41 +18,21 @@ def get_rewards_from_server(server_url: str, messages: list[str]):
     return rewards


-def test_reward_model(server_url, item, tokenizer):
-    query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
-    score = get_rewards_from_server(server_url, [query])[0]
-    return {
-        "problem_id": item["problem_id"],
-        "messages": item["messages"],
-        "eval_result": item["eval_result"],
-        "score": score,
-    }
-
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str)
-    parser.add_argument("--test", type=str)
-    parser.add_argument("--apps", type=str)
+    parser.add_argument("--model", type=str, help="path/to/model")
+    parser.add_argument("--sample", type=str, help="path/to/sample")
+    parser.add_argument("--output", type=str, help="path/to/score")
     args = parser.parse_args()

-    home_path = Path(args.model).parent
-    result_dir = home_path / "eval"
-    result_dir.mkdir(exist_ok=True)
-
     # compute score
-    test_dataset = load_jsonl(args.test)
+    dataset = load_jsonl(args.sample)
     server_url = "http://0.0.0.0:5000/get_reward"
     tokenizer = AutoTokenizer.from_pretrained(args.model)

-    results = [test_reward_model(server_url, item, tokenizer) for item in tqdm(test_dataset)]
-    score_path = result_dir / "scores.jsonl"
-    save_jsonl(results, score_path)
+    for item in tqdm(dataset):
+        query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
+        score = get_rewards_from_server(server_url, [query])[0]
+        item["score"] = score

-    # compute pass@k
-    results = load_jsonl(score_path)
-    groups = group_results(results, args.apps)
-    eval_results = [score_pass_at_k(groups, k, home_path.stem) for k in range(1, 16)]
-    eval_result_path = result_dir / "passk.jsonl"
-    save_jsonl(eval_results, eval_result_path)
-    pprint.pp(eval_results)
+    save_jsonl(dataset, args.output)
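The rewritten loop above sends one HTTP request per sample even though get_rewards_from_server accepts a list of messages. A hedged sketch of a batched variant follows; the batch size and chunking are illustrative and not part of the commit.

# Hypothetical batched scoring loop (illustrative only; BATCH is an assumed knob).
BATCH = 32
queries = [
    tokenizer.apply_chat_template(item["messages"], tokenize=False) for item in dataset
]
for start in range(0, len(queries), BATCH):
    rewards = get_rewards_from_server(server_url, queries[start:start + BATCH])
    for item, score in zip(dataset[start:start + BATCH], rewards):
        item["score"] = score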
@@ -11,7 +11,4 @@ def mk_critic_verify(answer=None):
     return message


-def get_score_token_id(tokenizer, token_str="Yes"):
-    score_tokens = tokenizer.encode(token_str, add_special_tokens=False)
-    assert len(score_tokens) == 1
-    return score_tokens[0]
# Note that the human and observation should appear in odd positions
# while llm should appear in even positions.
from codecritic.utils.json import save_jsonl
from pathlib import Path
def mk_message(user, assistant):
    return [
        {"role": "user", "content": user},
        {"role": "assistant", "content": assistant},
    ]


# TODO This function can be removed
def save_jsonl_dataset(dataset, output_dir, split="train"):
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    save_jsonl(dataset, output_dir / f"{split}.jsonl")

SPLITTER = "__I_wish_it_were_weekends_all_the_time.__"
-from functools import partial
 import os
+import numpy as np
 from vllm import LLM, SamplingParams
-from codecritic.data.utils import SPLITTER
+
+SPLITTER = "__I_wish_it_were_weekends_all_the_time.__"


 def generate_worker(cuda_device, prompts, model_path, sampling_params):
-    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_device)
-    llm = LLM(model=model_path,
-              seed=42,
-              max_model_len=8 * 1024,
-              swap_space=16,
-              tensor_parallel_size=len(cuda_device))
+    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_device)
+    llm = LLM(
+        model=model_path,
+        seed=42,
+        max_model_len=8 * 1024,
+        swap_space=16,
+        tensor_parallel_size=len(cuda_device),
+    )
     tokenizer = llm.get_tokenizer()
     stop_tokens = [tokenizer.eos_token_id]
     print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")

     vllm_sampling_params = SamplingParams(
-        n=sampling_params['n'],
-        temperature=sampling_params['temperature'],
+        n=sampling_params["n"],
+        temperature=sampling_params["temperature"],
         top_p=0.95,
-        max_tokens=sampling_params['max_tokens'],
-        stop_token_ids=stop_tokens
+        max_tokens=sampling_params["max_tokens"],
+        stop_token_ids=stop_tokens,
     )
     print("Sampling params:", vllm_sampling_params)

     def messages_to_text(messages):
-        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+        text = tokenizer.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
         if SPLITTER in text:
             text = text.split(SPLITTER)[0]
         return text

     text_prompts = [messages_to_text(item["messages"]) for item in prompts]
-    outputs = llm.generate(text_prompts, sampling_params=vllm_sampling_params, use_tqdm=True)
+    outputs = llm.generate(
+        text_prompts, sampling_params=vllm_sampling_params, use_tqdm=True
+    )

     results = []
     for item, output in zip(prompts, outputs):
@@ -58,16 +64,14 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
     return results


-def score_worker(cuda_device, prompts, model_path, score_token):
-    def compute_score_onetoken(logprob):
-        positive_token = score_token[0]
+def score_worker(cuda_device, prompts, model_path, positive_token, negative_token=None):
+    def only_positive(logprob):
         positive_logprob = logprob.get(positive_token)
         positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
         return {"score": positive_prob}

-    def compute_score_twotoken(logprob):
-        positive_token, negative_token = score_token[0], score_token[1]
+    def pos_and_neg(logprob):
         positive_logprob = logprob.get(positive_token)
         positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
@@ -76,40 +80,40 @@ def score_worker(cuda_device, prompts, model_path, score_token):
         return {
             "score": positive_prob / (positive_prob + negative_prob),
-            "uncertainty": 1 - (positive_prob + negative_prob)
+            "uncertainty": 1 - (positive_prob + negative_prob),
         }

-    if len(score_token) == 1:
-        compute_score = compute_score_onetoken
-    elif len(score_token) == 2:
-        compute_score = compute_score_twotoken
-    else:
-        raise NotImplementedError("param: score_token length should be 1 or 2")
+    compute_score = only_positive if negative_token is None else pos_and_neg

-    os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_device)
-    llm = LLM(model=model_path,
-              seed=42,
-              max_model_len=8 * 1024,
-              swap_space=16,
-              tensor_parallel_size=len(cuda_device))
+    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(cuda_device)
+    llm = LLM(
+        model=model_path,
+        seed=42,
+        max_model_len=8 * 1024,
+        swap_space=16,
+        tensor_parallel_size=len(cuda_device),
+    )
     tokenizer = llm.get_tokenizer()
     print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")

-    vllm_sampling_params = SamplingParams(
-        n=1,
-        temperature=0,
-        max_tokens=5,
-        logprobs=20
-    )
+    vllm_sampling_params = SamplingParams(n=1, temperature=0, max_tokens=5, logprobs=20)

-    text_prompts = [tokenizer.apply_chat_template(item["messages"], tokenize=False, add_generation_prompt=True) for item in prompts]
+    text_prompts = [
+        tokenizer.apply_chat_template(
+            item["messages"], tokenize=False, add_generation_prompt=True
+        )
+        for item in prompts
+    ]

-    outputs = llm.generate(text_prompts, sampling_params=vllm_sampling_params, use_tqdm=False)
+    outputs = llm.generate(
+        text_prompts, sampling_params=vllm_sampling_params, use_tqdm=False
+    )

     results = []
     for item, output in zip(prompts, outputs):
+        assert len(output.outputs) == 1, "The scorer must provide a single score."
         for response in output.outputs:
             # response.logprobs: list[dict[int, Logprob]] https://github.com/vllm-project/vllm/blob/main/vllm/sequence.py
             scores = compute_score(response.logprobs[0])
...
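For intuition, score_worker's helpers reduce to normalizing the probabilities of the positive and negative judge tokens at the first generated position. A standalone sketch of the same arithmetic with made-up logprob values (not from any real run):

import numpy as np

# Made-up logprobs for the "Yes" and "No" token ids (illustrative only).
yes_logprob, no_logprob = -0.2, -2.0
p_yes, p_no = np.exp(yes_logprob), np.exp(no_logprob)

only_positive_score = p_yes                  # used when negative_token is None
pos_and_neg_score = p_yes / (p_yes + p_no)   # normalized "Yes" probability
uncertainty = 1 - (p_yes + p_no)             # probability mass on neither token
print(only_positive_score, pos_and_neg_score, uncertainty)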
 import json
 from pathlib import Path


 def ensure_parent(path):
     path = Path(path)
     path.parent.mkdir(parents=True, exist_ok=True)
...
import numpy as np
from datasets import load_dataset
from collections import defaultdict
def estimate_pass_at_k(
    num_samples: list[int], num_correct: list[int], k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    return np.array(
        [estimator(int(n), int(c), k) for n, c in zip(num_samples, num_correct)]
    )

def group_results(results, apps_path):
    """
    Output
    {
        "interview": {
            problem_id: [
                {"problem_id": problem_id, "eval_result": True, ...},
                ...
            ],
            ...
        },
        ...
    }
    """
    dataset = load_dataset(apps_path)

    groups = defaultdict(lambda: defaultdict(list))
    for item in results:
        problem_id = item["problem_id"]
        split, idx = problem_id.split("_")
        difficulty = dataset[split][int(idx)]["difficulty"]
        groups[difficulty][problem_id].append(item)

    if "score" in results[0]:
        for difficulty, problem in groups.items():
            for problem_id, lst in problem.items():
                sorted_lst = sorted(lst, key=lambda x: x["score"], reverse=True)
                problem[problem_id] = sorted_lst

    return groups

def pass_at_k(groups, k):
    result = {"strategy": "pass@k", "k": k}
    for difficulty, problems in groups.items():
        num_samples, num_correct = [], []
        for lst in problems.values():
            num_samples.append(len(lst))
            num_correct.append(sum(item["eval_result"] for item in lst))
        pass_at_k = np.mean(estimate_pass_at_k(num_samples, num_correct, k))
        result[difficulty] = pass_at_k
    return result

def score_pass_at_k(groups, k, strategy):
    result = {"strategy": strategy, "k": k}
    for difficulty, problems in groups.items():
        num_samples, num_correct = 0, 0
        for lst in problems.values():
            num_samples += 1
            num_correct += any(item["eval_result"] for item in lst[:k])
        pass_at_k = num_correct / num_samples
        result[difficulty] = pass_at_k
    return result
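A small worked example of the two estimators above, with invented counts: estimate_pass_at_k is the unbiased pass@k estimator over n samples with c correct, while score_pass_at_k asks whether any of the k highest-scored samples for a problem passes. The numbers below are hypothetical.

# Worked example (hypothetical counts) against the functions defined above.
from codecritic.utils.metric import estimate_pass_at_k

# Three problems, 10 samples each, with 3 / 0 / 5 correct samples.
num_samples, num_correct = [10, 10, 10], [3, 0, 5]
print(estimate_pass_at_k(num_samples, num_correct, 1))  # -> [0.3, 0.0, 0.5]

# score_pass_at_k(groups, k, strategy) assumes each problem's sample list is already
# sorted by "score" (group_results sorts it when scores are present) and reports,
# per difficulty, the fraction of problems solved within the top-k samples.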