Commit 083ad155 by nzy

utils: add metrics

parent d0f8150e
@@ -18,6 +18,11 @@ Our experimental results demonstrate that COVER significantly outperforms existing ...
### Step 2 Prepare preference code pairs
### Step 3 Train ORM & Critic Model
### Step 4 Evaluate ORM & Critic Model
## Environment
Same as Llama-factory (recommended version).
import numpy as np
from datasets import load_dataset
from collections import defaultdict


def estimate_pass_at_k(
    num_samples: list[int], num_correct: list[int], k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    return np.array(
        [estimator(int(n), int(c), k) for n, c in zip(num_samples, num_correct)]
    )
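
# Worked example of the estimator above: with n = 5 samples, c = 2 correct, and
# k = 2, the product form gives 1 - (1 - 2/4) * (1 - 2/5) = 1 - 0.5 * 0.6 = 0.7,
# which matches the closed form 1 - comb(3, 2) / comb(5, 2) = 1 - 3/10 = 0.7.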


def group_results(results, apps_path):
    """
    Groups evaluation results by difficulty, then by problem id.

    Output:
    {
        "interview": {
            problem_id: [
                {"problem_id": problem_id, "eval_result": True, ...},
                ...
            ],
            ...
        },
        ...
    }
    """
    dataset = load_dataset(apps_path)
    groups = defaultdict(lambda: defaultdict(list))
    for item in results:
        problem_id = item["problem_id"]
        split, idx = problem_id.split("_")
        difficulty = dataset[split][int(idx)]["difficulty"]
        groups[difficulty][problem_id].append(item)
    return groups
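
# Example: an item with problem_id "test_0" is looked up as dataset["test"][0];
# if that problem's difficulty is "interview", the item lands in
# groups["interview"]["test_0"]. The exact split names and difficulty labels
# depend on the dataset loaded from apps_path.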


def pass_at_k(groups, k):
    result = {"strategy": f"pass@k={k}"}
    for difficulty, problems in groups.items():
        num_samples, num_correct = [], []
        for lst in problems.values():
            num_samples.append(len(lst))
            num_correct.append(sum(item["eval_result"] for item in lst))
        result[difficulty] = np.mean(estimate_pass_at_k(num_samples, num_correct, k))
    return result


def score_pass_at_k(groups, k, strategy):
    """
    Score-based variant of pass@k: per problem, samples are ranked by "score"
    and only the top k are checked for correctness.
    """
    result = {"strategy": f"{strategy} * pass@k={k}"}
    for difficulty, problems in groups.items():
        num_samples, num_correct = [], []
        for lst in problems.values():
            # keep only the k highest-scoring samples for this problem
            sorted_lst = sorted(lst, key=lambda x: x["score"], reverse=True)[:k]
            num_samples.append(len(lst))
            num_correct.append(sum(item["eval_result"] for item in sorted_lst))
        # average, over problems, of (# correct among top-k) / (# total samples)
        result[difficulty] = np.mean([c / n for c, n in zip(num_correct, num_samples)])
    return result
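

# Hedged usage sketch: assumes a results file containing a list of dicts of the
# form {"problem_id": "test_0", "eval_result": True, "score": 0.87}. The file
# name and the APPS dataset path below are illustrative placeholders.
if __name__ == "__main__":
    import json

    with open("results.json") as f:
        results = json.load(f)

    groups = group_results(results, "codeparrot/apps")
    print(pass_at_k(groups, k=1))
    print(score_pass_at_k(groups, k=1, strategy="orm"))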