Skip to content
Projects
Groups
Snippets
Help
This project
Loading...
Sign in / Register
Toggle navigation
codecritic
Overview
Overview
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
CI / CD
CI / CD
Pipelines
Jobs
Schedules
Charts
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Jobs
Commits
Issue Boards
Open sidebar
Ziyuan Nan
codecritic
Commits
348877f0
Commit
348877f0
authored
Oct 02, 2024
by
nzy
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
step1: sort and split dataset
parent
c1707b7d
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
82 additions
and
2 deletions
+82
-2
example_config.toml
+5
-2
readme.qmd
+7
-0
step1_sort_split_dataset.py
+70
-0
No files found.
example_config.toml
View file @
348877f0
...
...
@@ -11,4 +11,7 @@ temperature = 0.0
max_new_tokens
=
512
[evaluate]
evaluate_result_path
=
""
\ No newline at end of file
evaluate_result_path
=
""
train_path
=
""
test_path
=
""
minimal_test_path
=
""
\ No newline at end of file
readme.qmd
View file @
348877f0
...
...
@@ -16,6 +16,13 @@ Our experimental results demonstrate that COVER significantly outperforms existi
### Step1 Sample & Evaluate
```bash
python step1_mk_prompt.py
python step1_sample_code.py
python step1_evaluate_code.py
python step1_sort_split_dataset.py
```
### Step2 Prepare preference code pairs
### Step3 Train ORM & Critic Model
...
...
step1_sort_split_dataset.py
0 → 100644
View file @
348877f0
from
utils
import
load_jsonl
,
save_jsonl
,
read_config
def mk_key_for_sort(item):
    """Build a sortable key (prefix_weight, numeric_index) from an item's problem_id.

    A problem_id looks like 'train_12' or 'test_3050'. Ids whose prefix is
    'train' get weight 0 so they sort before every other prefix; ties break
    on the numeric suffix.
    """
    prefix, idx = item['problem_id'].split('_')
    if prefix == 'train':
        weight = 0
    else:
        weight = 1
    return weight, int(idx)
def sort_and_drop_key(dataset, key):
    """Sort a list of dicts by the value stored under `key`, then strip that key.

    Returns the new sorted list; the dicts themselves are mutated in place
    (the sort key is removed from each record).
    """
    ordered = sorted(dataset, key=lambda rec: rec[key])
    for rec in ordered:
        del rec[key]
    return ordered
# Half-open [start, end) windows of problem indices reserved for the test split.
TEST_RANGES = [(0, 300), (3000, 3100), (4000, 4100)]


def is_in_test_range(prefix_weight, idx):
    """Return True when a non-train item (prefix_weight == 1) falls inside any TEST_RANGES window."""
    if prefix_weight != 1:
        return False
    return any(lo <= idx < hi for lo, hi in TEST_RANGES)
def sort_and_split_dataset(raw_dataset_path, new_train_path, new_test_path, minimal_test_path, n):
    """Split the evaluated dataset into train / test / minimal-test JSONL files.

    The dataset will be divided into two parts: Train and Test.
    From the Test set, 10% of items across varying difficulties will be selected.
    Among these, only those items for which the LLM can generate correct
    solutions will be included in the minimal testset.
    This approach reduces the test time by approximately 1/5.

    Args:
        raw_dataset_path: JSONL file of evaluated samples; each record must
            carry 'problem_id' (e.g. 'train_12') and 'eval_result'.
        new_train_path: output JSONL path for the train split.
        new_test_path: output JSONL path for the test split.
        minimal_test_path: output JSONL path for the minimal test split.
        n: samples per problem; the sorted test split is chunked into
            consecutive groups of n, one group per problem.

    Raises:
        ValueError: if the test split size is not a multiple of n.
    """
    dataset = load_jsonl(raw_dataset_path)

    # Tag each record with a temporary sort key, then route it to train or test.
    new_train, new_test = [], []
    for item in dataset:
        item["key_for_sort"] = mk_key_for_sort(item)
        if is_in_test_range(*item["key_for_sort"]):
            new_test.append(item)
        else:
            new_train.append(item)

    new_train = sort_and_drop_key(new_train, "key_for_sort")
    new_test = sort_and_drop_key(new_test, "key_for_sort")

    # Fix: was `assert len(new_test) % n == 0`, which is silently stripped
    # under `python -O`; raise an explicit, descriptive error instead.
    if len(new_test) % n != 0:
        raise ValueError(
            f"test split size {len(new_test)} is not a multiple of n={n}"
        )

    # Keep only the problems whose n samples all evaluated as correct.
    minimal_test = []
    for i in range(len(new_test) // n):
        problem = new_test[i * n:(i + 1) * n]
        # NOTE(review): all() requires *every* sample to be correct, while the
        # variable name and the docstring ("can generate correct solutions")
        # read like any() — confirm intended semantics with the author.
        has_correct_solution = all(d["eval_result"] for d in problem)
        if has_correct_solution:
            minimal_test.extend(problem)

    save_jsonl(new_train, new_train_path)
    save_jsonl(new_test, new_test_path)
    save_jsonl(minimal_test, minimal_test_path)
if __name__ == "__main__":
    # Bug fix: `read_config` is a function; the original `cfg = read_config`
    # bound the function object itself, so `cfg["evaluate"]` raised TypeError
    # ('function' object is not subscriptable). It must be called.
    cfg = read_config()
    sort_and_split_dataset(
        cfg["evaluate"]["evaluate_result_path"],
        cfg["evaluate"]["train_path"],
        cfg["evaluate"]["test_path"],
        cfg["evaluate"]["minimal_test_path"],
        cfg["sample"]["sampling_params"]["n"],
    )
\ No newline at end of file
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment