Commit 348877f0 by nzy

step1: sort and split dataset

parent c1707b7d
......@@ -11,4 +11,7 @@ temperature = 0.0
max_new_tokens = 512
[evaluate]
evaluate_result_path = ""
\ No newline at end of file
evaluate_result_path = ""
train_path = ""
test_path = ""
minimal_test_path = ""
\ No newline at end of file
......@@ -16,6 +16,13 @@ Our experimental results demonstrate that COVER significantly outperforms existi
### Step1 Sample & Evaluate
```bash
python step1_mk_prompt.py
python step1_sample_code.py
python step1_evaluate_code.py
python step1_sort_split_dataset.py
```
### Step2 Prepare preference code pairs
### Step3 Train ORM & Critic Model
......
from utils import load_jsonl, save_jsonl, read_config
def mk_key_for_sort(item):
    """Build the sort key for one dataset record.

    ``item['problem_id']`` must have the form ``"<prefix>_<number>"`` with
    exactly one underscore. The returned key is ``(weight, number)``, where
    the weight is 0 for the ``train`` prefix and 1 for any other prefix, so
    ``train`` records order before everything else.
    """
    split_name, number = item['problem_id'].split('_')
    if split_name == 'train':
        weight = 0
    else:
        weight = 1
    return weight, int(number)
def sort_and_drop_key(dataset, key):
    """Return the records ordered by ``record[key]`` with that field removed.

    The input sequence itself is not reordered, but the records it holds are
    mutated in place: ``key`` is popped from each of them after sorting.
    """
    ordered = list(dataset)
    ordered.sort(key=lambda record: record[key])
    for record in ordered:
        record.pop(key)
    return ordered
# Half-open [start, end) index ranges consulted by `is_in_test_range`.
TEST_RANGES = [(0, 300), (3000, 3100), (4000, 4100)]


def is_in_test_range(prefix_weight, idx):
    """Tell whether a ``(prefix_weight, idx)`` sort key belongs to the test split.

    Returns True only when ``prefix_weight`` is 1 and ``idx`` lies inside one
    of the half-open ``TEST_RANGES`` intervals; otherwise returns False.
    """
    if prefix_weight != 1:
        return False
    return any(low <= idx < high for low, high in TEST_RANGES)
def sort_and_split_dataset(raw_dataset_path, new_train_path, new_test_path, minimal_test_path, n):
    """Split the evaluated samples into train, test and minimal-test files.

    The dataset is divided into two parts: Train and Test. A record goes to
    Test when its sort key is accepted by ``is_in_test_range`` (per the
    original author, about 10% of the test problems across varying
    difficulties); every other record goes to Train. Both parts are sorted
    by that key before being written.

    From the Test part, only problems for which the LLM generated at least
    one correct solution (a truthy ``eval_result`` among the problem's ``n``
    samples) are copied to the minimal test set. Per the original author,
    this reduces the test time by approximately 1/5.

    Args:
        raw_dataset_path: JSONL file with the evaluated samples to split.
        new_train_path: Output JSONL path for the Train part.
        new_test_path: Output JSONL path for the Test part.
        minimal_test_path: Output JSONL path for the minimal test set.
        n: Number of samples per problem. After sorting, the Test part is
            read as consecutive groups of ``n`` records, one group per
            problem.

    Raises:
        AssertionError: If the Test part is not a whole number of groups
            of ``n`` records.
    """
    dataset = load_jsonl(raw_dataset_path)

    # Tag each record with a temporary `key_for_sort` and route it to a split.
    new_train, new_test = [], []
    for item in dataset:
        item["key_for_sort"] = mk_key_for_sort(item)
        if is_in_test_range(*item["key_for_sort"]):
            new_test.append(item)
        else:
            new_train.append(item)

    new_train = sort_and_drop_key(new_train, "key_for_sort")
    new_test = sort_and_drop_key(new_test, "key_for_sort")

    # Records of the same problem share a sort key, so they are adjacent after
    # sorting; slicing by n only lines up with problems if every problem
    # contributed exactly n samples.
    assert len(new_test) % n == 0, (
        f"test split has {len(new_test)} records, not a multiple of n={n}"
    )

    minimal_test = []
    for start in range(0, len(new_test), n):
        problem = new_test[start:start + n]
        # Keep the problem when at least one sample is correct. The original
        # used all(), which kept only problems where every sample passed and
        # contradicted both the flag name and the docstring.
        has_correct_solution = any(d["eval_result"] for d in problem)
        if has_correct_solution:
            minimal_test.extend(problem)

    save_jsonl(new_train, new_train_path)
    save_jsonl(new_test, new_test_path)
    save_jsonl(minimal_test, minimal_test_path)
if __name__ == "__main__":
    # read_config must be called: the original bound the function object
    # itself, so the first cfg[...] lookup raised TypeError.
    # NOTE(review): assumes read_config takes no required arguments - confirm
    # against utils.
    cfg = read_config()
    evaluate_cfg = cfg["evaluate"]
    sort_and_split_dataset(
        evaluate_cfg["evaluate_result_path"],
        evaluate_cfg["train_path"],
        evaluate_cfg["test_path"],
        evaluate_cfg["minimal_test_path"],
        # Samples per problem; used to group the test records by problem.
        cfg["sample"]["sampling_params"]["n"],
    )
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment