Commit 083ad155 by nzy

utils: add metrics

parent d0f8150e
@@ -18,6 +18,11 @@ Our experimental results demonstrate that COVER significantly outperforms existing ...
### Step 2 Prepare preference code pairs
### Step 3 Train ORM & Critic Model
### Step 4 Evaluate ORM & Critic Model
## Environment
Same as Llama-factory (recommended version).
import numpy as np
from datasets import load_dataset
from collections import defaultdict


def estimate_pass_at_k(
    num_samples: list[int], num_correct: list[int], k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    return np.array(
        [estimator(int(n), int(c), k) for n, c in zip(num_samples, num_correct)]
    )
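
# Worked example of the estimator above: with n = 5 samples, c = 2 correct, and
# k = 2, the product form gives 1 - (1 - 2/4) * (1 - 2/5) = 1 - 0.5 * 0.6 = 0.7,
# which matches the closed form 1 - comb(3, 2) / comb(5, 2) = 1 - 3/10 = 0.7.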


def group_results(results, apps_path):
    """
    Groups evaluation results by difficulty, then by problem id.

    Output:
    {
        "interview": {
            problem_id: [
                {"problem_id": problem_id, "eval_result": True, ...},
                ...
            ],
            ...
        },
        ...
    }
    """
    dataset = load_dataset(apps_path)
    groups = defaultdict(lambda: defaultdict(list))
    for item in results:
        problem_id = item["problem_id"]
        split, idx = problem_id.split("_")
        difficulty = dataset[split][int(idx)]["difficulty"]
        groups[difficulty][problem_id].append(item)
    return groups
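
# Example: an item with problem_id "test_0" is looked up as dataset["test"][0];
# if that problem's difficulty is "interview", the item lands in
# groups["interview"]["test_0"]. The exact split names and difficulty labels
# depend on the dataset loaded from apps_path.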


def pass_at_k(groups, k):
    result = {"strategy": f"pass@k={k}"}
    for difficulty, problems in groups.items():
        num_samples, num_correct = [], []
        for lst in problems.values():
            num_samples.append(len(lst))
            num_correct.append(sum(item["eval_result"] for item in lst))
        result[difficulty] = np.mean(estimate_pass_at_k(num_samples, num_correct, k))
    return result


def score_pass_at_k(groups, k, strategy):
    """
    Score-based variant of pass@k: per problem, samples are ranked by "score"
    and only the top k are checked for correctness.
    """
    result = {"strategy": f"{strategy} * pass@k={k}"}
    for difficulty, problems in groups.items():
        num_samples, num_correct = [], []
        for lst in problems.values():
            # keep only the k highest-scoring samples for this problem
            sorted_lst = sorted(lst, key=lambda x: x["score"], reverse=True)[:k]
            num_samples.append(len(lst))
            num_correct.append(sum(item["eval_result"] for item in sorted_lst))
        # average, over problems, of (# correct among top-k) / (# total samples)
        result[difficulty] = np.mean([c / n for c, n in zip(num_correct, num_samples)])
    return result
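

# Hedged usage sketch: assumes a results file containing a list of dicts of the
# form {"problem_id": "test_0", "eval_result": True, "score": 0.87}. The file
# name and the APPS dataset path below are illustrative placeholders.
if __name__ == "__main__":
    import json

    with open("results.json") as f:
        results = json.load(f)

    groups = group_results(results, "codeparrot/apps")
    print(pass_at_k(groups, k=1))
    print(score_pass_at_k(groups, k=1, strategy="orm"))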