step1: sort and split dataset

348877f0 · nzy · c1707b7d · 348877f0 · 348877f0 · 348877f0
Commit 348877f0 authored Oct 02, 2024 by nzy
Hide whitespace changes
Inline Side-by-side

Showing with 82 additions and 2 deletions

example_config.toml
+5 -2

readme.qmd
+7 -0

step1_sort_split_dataset.py
+70 -0

No files found.
--- a/example_config.toml
+++ b/example_config.toml
@@ -11,4 +11,7 @@ temperature = 0.0
 max_new_tokens = 512

 [evaluate]
-evaluate_result_path = ""
\ No newline at end of file
+evaluate_result_path = ""
+train_path = ""
+test_path = ""
+minimal_test_path = ""
\ No newline at end of file
--- a/readme.qmd
+++ b/readme.qmd
@@ -16,6 +16,13 @@ Our experimental results demonstrate that COVER significantly outperforms existi

 ### Step1 Sample & Evaluate

+```bash
+python step1_mk_prompt.py
+python step1_sample_code.py
+python step1_evaluate_code.py
+python step1_sort_split_dataset.py
+```
+
 ### Step2 Prepare preference code pairs

 ### Step3 Train ORM & Critic Model

--- a/step1_sort_split_dataset.py
+++ b/step1_sort_split_dataset.py
+from utils import load_jsonl, save_jsonl, read_config
+
+
+def mk_key_for_sort(item):
+    """Build the sort key ``(prefix_weight, index)`` for a dataset item.
+
+    ``item['problem_id']`` must have the form ``"<prefix>_<number>"`` with
+    exactly one underscore. Items whose prefix is ``'train'`` get weight 0
+    and therefore sort before every other prefix, which gets weight 1.
+    """
+    split_name, number = item['problem_id'].split('_')
+    if split_name == 'train':
+        return (0, int(number))
+    return (1, int(number))
+
+
+def sort_and_drop_key(dataset, key):
+    """Return the items of ``dataset`` ordered by ``item[key]``, with ``key`` removed.
+
+    The returned list is new, but the items themselves are modified in
+    place: the ``key`` entry is popped from every item.
+    """
+    ordered = list(dataset)
+    ordered.sort(key=lambda record: record[key])
+    for record in ordered:
+        # The key was only a temporary sorting aid; strip it from the output.
+        record.pop(key)
+    return ordered
+
+
+# Half-open ``[start, end)`` index ranges that are routed to the test split.
+TEST_RANGES = [(0, 300), (3000, 3100), (4000, 4100)]
+
+
+def is_in_test_range(prefix_weight, idx):
+    """Return True when ``idx`` lies in one of ``TEST_RANGES``.
+
+    Only keys with ``prefix_weight == 1`` can match; any other weight
+    always yields False.
+    """
+    if prefix_weight != 1:
+        return False
+    return any(lower <= idx < upper for lower, upper in TEST_RANGES)
+
+
+def sort_and_split_dataset(raw_dataset_path, new_train_path, new_test_path, minimal_test_path, n):
+    """
+    Sort the evaluated samples and write train, test and minimal-test splits.
+
+    Items whose sort key (see ``mk_key_for_sort``) is accepted by
+    ``is_in_test_range`` go to the test split; every other item goes to the
+    train split. Both splits are sorted by that key before being saved.
+
+    The minimal test set keeps only those test problems for which at least
+    one of the ``n`` sampled solutions has a truthy ``eval_result``, i.e.
+    problems for which the LLM can generate a correct solution.
+
+    Original author's note: the test split is about 10% of the items across
+    varying difficulties, and the minimal set reduces the test time by
+    approximately 1/5.
+
+    Args:
+        raw_dataset_path: JSONL file with one evaluated sample per line; each
+            item must provide ``problem_id`` and ``eval_result``.
+        new_train_path: Output path for the sorted train split.
+        new_test_path: Output path for the sorted test split.
+        minimal_test_path: Output path for the minimal test split.
+        n: Number of samples per problem.
+
+    Raises:
+        ValueError: If ``n`` is not positive, if the test split size is not a
+            multiple of ``n``, or if a group of ``n`` consecutive test items
+            does not belong to a single problem.
+    """
+    if n <= 0:
+        raise ValueError(f"n must be a positive integer, got {n!r}")
+
+    dataset = load_jsonl(raw_dataset_path)
+
+    # Tag every item with a temporary sort key and route it to its split.
+    new_train, new_test = [], []
+    for item in dataset:
+        item["key_for_sort"] = mk_key_for_sort(item)
+        target = new_test if is_in_test_range(*item["key_for_sort"]) else new_train
+        target.append(item)
+
+    new_train = sort_and_drop_key(new_train, "key_for_sort")
+    new_test = sort_and_drop_key(new_test, "key_for_sort")
+
+    # Explicit check instead of `assert`, which is stripped under `python -O`.
+    if len(new_test) % n != 0:
+        raise ValueError(
+            f"test split has {len(new_test)} items, which is not a multiple of n={n}"
+        )
+
+    minimal_test = []
+    for start in range(0, len(new_test), n):
+        samples = new_test[start:start + n]
+
+        # After sorting, the n samples of one problem are adjacent. Refuse to
+        # continue if a chunk mixes problems, since that would silently
+        # corrupt the minimal test set.
+        problem_ids = {sample["problem_id"] for sample in samples}
+        if len(problem_ids) != 1:
+            raise ValueError(
+                f"expected {n} samples of a single problem, got {sorted(problem_ids)}"
+            )
+
+        # Keep the problem when at least one sample is correct. `all` here
+        # would keep only problems where every sample passed.
+        has_correct_solution = any(sample["eval_result"] for sample in samples)
+        if has_correct_solution:
+            minimal_test.extend(samples)
+
+    save_jsonl(new_train, new_train_path)
+    save_jsonl(new_test, new_test_path)
+    save_jsonl(minimal_test, minimal_test_path)
+
+
+if __name__ == "__main__":
+    # read_config has to be called. Binding the function object itself makes
+    # the subscripts below fail with "TypeError: 'function' object is not
+    # subscriptable".
+    # NOTE(review): assumes read_config() takes no required arguments;
+    # confirm against utils.read_config.
+    cfg = read_config()
+    evaluate_cfg = cfg["evaluate"]
+    sort_and_split_dataset(
+        evaluate_cfg["evaluate_result_path"],
+        evaluate_cfg["train_path"],
+        evaluate_cfg["test_path"],
+        evaluate_cfg["minimal_test_path"],
+        # Number of samples generated per problem in the sampling step.
+        cfg["sample"]["sampling_params"]["n"],
+    )
\ No newline at end of file