Ziyuan Nan / codecritic · Commits

Commit d11a2acf, authored Dec 28, 2024 by nzy
new apps sample

Parent: 94ea5cb1
Showing 5 changed files with 109 additions and 40 deletions.
README.md                              +1    -0
codecritic/cli/gen_dataset.py          +104  -0
codecritic/cli/mk_rm_dataset.py        +0    -33
codecritic/cli/test_genrm.py           +2    -2
codecritic/evaluation/apps_eval.py     +2    -5
README.md  View file @ d11a2acf
...
@@ -22,6 +22,7 @@
    "pass": "boolean, indicates whether the solution passed the task",
    "skip": "boolean, set to True if no solution passes this task",
    "messages": "list of dictionaries, conversation messages in OpenAI format",
    "code": "clean code",
    "positive_score": "float, probability of the 'Yes' token",
    "negative_score": "float, probability of the 'No' token",
    "meta_***": "any additional data or custom fields",
...
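For reference, a hypothetical record matching the schema above might look like the following sketch; the field values are invented for illustration and are not taken from the dataset.

# Hypothetical JSONL record following the README schema; values are illustrative only.
import json

record = {
    "pass": True,
    "skip": False,
    "messages": [
        {"role": "user", "content": "Write a program that sums two integers."},
        {"role": "assistant", "content": "a, b = map(int, input().split())\nprint(a + b)"},
    ],
    "code": "a, b = map(int, input().split())\nprint(a + b)",
    "positive_score": 0.91,
    "negative_score": 0.09,
    "meta_difficulty": "introductory",  # example of a custom meta_*** field
}
print(json.dumps(record))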
codecritic/cli/gen_dataset.py  0 → 100644  View file @ d11a2acf
import argparse
import json
from functools import partial
from collections import defaultdict

from datasets import load_dataset
from vllm import SamplingParams
from transformers import AutoTokenizer

from codecritic.dataset.apps import mk_prompt
from codecritic.dataset.code import extract_code
from codecritic.evaluation.apps_eval import evaluate
from codecritic.utils.inference import generate_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import save_jsonl


def transform_to_prompt(apps, tokenizer):
    """Convert APPS items into chat prompts, skipping malformed or overly long tasks."""
    prompts = []
    for split in ["train", "test"]:
        dataset = apps[split]
        for item in dataset:
            task_id = split + "-" + str(item["id"])

            try:
                json.loads(item["input_output"])
            except ValueError:
                print(f"Skipping {task_id}: Invalid JSON in input_output")
                continue

            prompt = mk_prompt(item)

            # Filter long prompts
            tokenized_question = tokenizer.apply_chat_template(prompt, tokenize=True)
            length = len(tokenized_question)
            if length > 2048:
                print(f"Skipping {task_id}: Token length {length} exceeds limit")
                continue

            prompts.append(
                {
                    "dataset": "apps-" + item["difficulty"],
                    "task_id": task_id,  # the computed id, not the literal string "task_id"
                    "messages": prompt,
                }
            )
    return prompts


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--apps", type=str, help="path/to/apps")
    parser.add_argument("--train", type=str, help="path/to/train")
    parser.add_argument("--test", type=str, help="path/to/test")
    parser.add_argument("--gpu", type=int, default=1, help="gpu number required by one model")
    args = parser.parse_args()

    apps = load_dataset(args.apps)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    dataset = transform_to_prompt(apps, tokenizer)

    # sampling
    sampling_params = SamplingParams(
        n=50,
        temperature=0.8,
        top_p=0.95,
        max_tokens=2048,
    )
    worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
    dataset = model_map(worker, dataset, args.gpu)

    # postprocess: group samples by task, then split tasks into train/test partitions
    grouped = defaultdict(list)
    for sample in dataset:
        grouped[sample["task_id"]].append(sample)  # append, so every sample per task is kept

    def is_in_test(task_id):
        split, idx = task_id.split("-")
        idx = int(idx)  # cast before the numeric range checks below
        if split == "test":
            for start, end in [(0, 300), (3000, 3100), (4000, 4100)]:
                if start <= idx < end:
                    return True
        return False

    trainset, testset = [], []
    for task_id, group in grouped.items():
        target = testset if is_in_test(task_id) else trainset
        for idx, sample in enumerate(group):
            sample["solution_id"] = idx
            sample["code"] = extract_code(sample["messages"][-1]["content"])
            target.append(sample)

    trainset = evaluate(trainset, apps)
    testset = evaluate(testset, apps)

    save_jsonl(trainset, args.train)
    save_jsonl(testset, args.test)
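A possible invocation of this new script, assuming the package is importable and using placeholder paths and a single GPU per model (the exact paths are not part of the commit):

    python -m codecritic.cli.gen_dataset --model /path/to/model --apps /path/to/apps --train train.jsonl --test test.jsonl --gpu 1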
codecritic/cli/mk_rm_dataset.py  deleted 100644 → 0  View file @ 94ea5cb1
import argparse
from pathlib import Path

from codecritic.utils.json import load_json
from codecritic.dataset.utils import save_jsonl_dataset
from codecritic.dataset.edit_distance import (
    mk_problem_groups,
    calculate_edit_distances,
    mk_edit_distance_dataset,
)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_dir", type=str)
    parser.add_argument("--output_dir", type=str)
    parser.add_argument("--is_max", type=bool, required=True)
    args = parser.parse_args()

    dataset_dir = Path(args.dataset_dir)
    train_path = dataset_dir / "train.jsonl"
    sampling_params = load_json(dataset_dir / "sampling_params.json")

    problems = mk_problem_groups(train_path, sampling_params["n"])
    all_edit_distance_pairs = calculate_edit_distances(problems)

    postfix = "max" if args.is_max else "min"
    dataset_name = f"apps_edit_distance_{postfix}"

    preference_pairs, metadata = mk_edit_distance_dataset(
        all_edit_distance_pairs, 10 * 1000, 5, is_max=args.is_max
    )
    save_jsonl_dataset(preference_pairs, args.output_dir)
codecritic/cli/test_genrm.py  View file @ d11a2acf
...
@@ -24,7 +24,7 @@ if __name__ == "__main__":
         help="maximum number of tokens allowed for the reasoning process.",
     )
     parser.add_argument(
-        "--gpu", type=int, default=1, help="gpu number required by model"
+        "--gpu", type=int, default=1, help="gpu number required by one model"
     )
     args = parser.parse_args()
...
@@ -47,7 +47,7 @@ if __name__ == "__main__":
     worker = partial(
         generate_worker, model_path=args.model, sampling_params=sampling_params
     )
-    dataset = model_map(worker, dataset, args.gpu_per_model)
+    dataset = model_map(worker, dataset, args.gpu)

     def get_token_id(token):
         score_tokens = tokenizer.encode(token, add_special_tokens=False)
...
codecritic/evaluation/apps_eval.py  View file @ d11a2acf
...
@@ -6,8 +6,6 @@ import multiprocessing
 import numpy as np
 from tqdm.contrib.concurrent import process_map
-from datasets import load_dataset
 from codecritic.evaluation.apps_exec import run_test

 TIMEOUT = 10
...
@@ -41,7 +39,7 @@ def check_correctness(sample, generation, timeout, debug=False):
 def test_generation(args, debug=False):
     apps_item, sample = args
-    code = sample["meta_clean_code"]
+    code = sample["code"]
     curr_res = [-2]
     try:
...
@@ -92,7 +90,7 @@ def evaluate_code_samples(code_samples, apps):
     return results


-def evaluate(code_samples, apps_path):
+def evaluate(code_samples, apps):
     """
     There are some strange bugs in apps evaluation that cannot be reproduced.
     The observable issue is that the same code will yield different 'eval_result' values.
...
@@ -102,7 +100,6 @@ def evaluate(code_samples, apps_path):
     Run twice to verify if the result is consistent.
     The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
     """
-    apps = load_dataset(apps_path)
     all_results = []
     for _ in range(3):
         results = evaluate_code_samples(code_samples, apps)
...
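The docstring above describes retrying the evaluation until runs agree; a minimal standalone sketch of that retry-until-consistent idea follows, with a hypothetical evaluate_once callable standing in for a single evaluation pass (this is not the repository's actual implementation):

# Sketch only: 'evaluate_once' is a stand-in for one evaluation pass such as
# evaluate_code_samples(code_samples, apps).
def evaluate_until_consistent(evaluate_once, loop_num=3):
    previous = None
    for _ in range(loop_num):
        current = evaluate_once()
        if previous is not None and current == previous:
            return current  # two consecutive runs agreed, accept the result
        previous = current
    return previous  # fall back to the last result after loop_num attempts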