Ziyuan Nan / codecritic / Commits

Commit 8db3b29c
authored Oct 06, 2024 by nzy
step1: evaluate code multiple times to obtain a consistent result
parent 6d8e17dc

Showing 1 changed file with 56 additions and 14 deletions:

step1_evaluate_code.py
@@ -17,12 +17,15 @@ def check_correctness(sample, generation, timeout, debug=False):
     """Check correctness of code generation with a global timeout.
     The global timeout is to catch some extreme/rare cases not handled by the timeouts
     inside `run_test`"""
 
     def _temp_run(sample, generation, debug, result):
         result.append(run_test(sample, test=generation, debug=debug))
+
     manager = multiprocessing.Manager()
     result = manager.list()
-    p = multiprocessing.Process(target=_temp_run, args=(sample, generation, debug, result))
+    p = multiprocessing.Process(
+        target=_temp_run, args=(sample, generation, debug, result)
+    )
     p.start()
     p.join(timeout=timeout + 1)
     if p.is_alive():
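For context on the pattern in this hunk: the docstring explains that the global timeout exists to catch extreme cases that the timeouts inside `run_test` miss. The sketch below shows the same idea in isolation, with no dependence on the repository's code: run the check in a child process, wait `timeout + 1` seconds, and kill the child if it is still alive. `slow_check` and `check_with_global_timeout` are made-up names for illustration; `run_test` itself is not part of this diff.

import multiprocessing
import time


def slow_check(result):
    # Stand-in for run_test: pretend the evaluation hangs forever.
    time.sleep(60)
    result.append(True)


def check_with_global_timeout(timeout=3):
    manager = multiprocessing.Manager()
    result = manager.list()
    p = multiprocessing.Process(target=slow_check, args=(result,))
    p.start()
    p.join(timeout=timeout + 1)
    if p.is_alive():
        # The child exceeded the global timeout: kill it and report no result.
        p.kill()
        return None
    return list(result)


if __name__ == "__main__":
    print(check_with_global_timeout())  # prints None after roughly 4 seconds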
@@ -71,27 +74,66 @@ def test_generation(args, debug=False):
     return code_sample
 
 
-def evaluate_code_samples(code_samples: list, dataset_path: str):
-    apps_eval = load_dataset(dataset_path)
-
-    def get_apps_item(item):
-        problem_id = item["problem_id"]
-        split, idx = problem_id.split('_')
-        # get corresponding samples from APPS dataset
-        return apps_eval[split][int(idx)]
-
-    args = [(get_apps_item(sample), sample) for sample in code_samples]
+def get_apps_item(item, apps):
+    problem_id = item["problem_id"]
+    split, idx = problem_id.split("_")
+    # get corresponding samples from APPS dataset
+    return apps[split][int(idx)]
+
+
+def evaluate_code_samples(code_samples, apps):
+    args = [(get_apps_item(sample, apps), sample) for sample in code_samples]
     cpu_num = multiprocessing.cpu_count()
     # TODO `chunksize` affects performance a lot
-    results = process_map(test_generation, args, max_workers=cpu_num, chunksize=1000)
+    chunksize = len(code_samples) // (cpu_num * 5)
+    results = process_map(test_generation, args, max_workers=cpu_num, chunksize=chunksize)
     return results
 
 
+def evaluate_incorrect_code_samples_again(results, apps, loop_num):
+    """
+    There are some strange bugs in apps evaluation that cannot be reproduced.
+    The observable issue is that the same code will yield different 'eval_result' values.
+    Typically, the test framework may encounter an exception or decide that the code has timed out unreasonably.
+    This function is an ugly workaround to address this problem:
+    If the function returns a timeout result or raises an exception, it will be run twice to verify if the result is consistent.
+    The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
+    """
+    maybe_incorrect_lst, correct_lst = [], []
+    for item in results:
+        if any(x in item["eval_result"] for x in (-1, -2)):
+            maybe_incorrect_lst.append(item)
+        else:
+            correct_lst.append(item)
+    print(f"maybe incorrect lst size: {len(maybe_incorrect_lst)}")
+
+    for _ in range(loop_num):
+        if len(maybe_incorrect_lst) == 0:
+            break
+        new_results = evaluate_code_samples(maybe_incorrect_lst, apps)
+        for i, (old_item, new_item) in enumerate(zip(maybe_incorrect_lst, new_results)):
+            old_eval, new_eval = old_item["eval_results"], new_item["eval_results"]
+            if old_eval == new_eval:
+                item = maybe_incorrect_lst.pop(i)
+                correct_lst.append(item)
+            else:
+                maybe_incorrect_lst[i] = new_item
+
+    assert len(results) == len(correct_lst), "cannot correctly evaluate codes" + str(maybe_incorrect_lst)
+    return correct_lst
+
+
 def evaluate(code_sample_path, dataset_path, output_path):
     code_samples = load_jsonl(code_sample_path)
-    results = evaluate_code_samples(code_samples, dataset_path)
+    apps = load_dataset(dataset_path)
+    results = evaluate_code_samples(code_samples, apps)
+    results = evaluate_incorrect_code_samples_again(results, apps, 5)
     save_jsonl(results, output_path)
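A note on the new chunksize heuristic (an editorial observation, not part of the commit): the TODO flags that `chunksize` strongly affects `process_map` throughput, and `len(code_samples) // (cpu_num * 5)` aims for roughly five chunks per CPU. For very small batches, such as the handful of flaky samples re-run by `evaluate_incorrect_code_samples_again`, the integer division can reach 0, which `ProcessPoolExecutor.map` (used under the hood by tqdm's `process_map`) rejects. A clamped variant is sketched below; the `max(1, ...)` guard and the function name are assumptions added here, not code from the repository.

import multiprocessing


def pick_chunksize(n_tasks: int, chunks_per_cpu: int = 5) -> int:
    # Mirrors the commit's heuristic (split work into ~5 chunks per CPU),
    # with a max(1, ...) clamp so tiny re-runs never produce chunksize=0.
    cpu_num = multiprocessing.cpu_count()
    return max(1, n_tasks // (cpu_num * chunks_per_cpu))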
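Putting the pieces together, the updated entry point would presumably be driven as in the sketch below: a JSONL file of generated code samples goes in, every sample is evaluated in parallel against the APPS dataset, timeout/exception results are re-evaluated up to five times until two runs agree, and the consistent results are written back out. The paths and dataset identifier are placeholders for illustration, not values taken from the repository.

from step1_evaluate_code import evaluate

if __name__ == "__main__":
    # Hypothetical driver; the paths and dataset id below are placeholders.
    evaluate(
        code_sample_path="data/code_samples.jsonl",
        dataset_path="codeparrot/apps",
        output_path="data/code_samples.eval.jsonl",
    )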