Ziyuan Nan / codecritic / Commits

Commit 94ea5cb1, authored Dec 28, 2024 by nzy

new evaluation logic
Parent: b1db6cb1
Showing 1 changed file with 43 additions and 48 deletions.

codecritic/evaluation/apps_eval.py  (+43, -48)  — view file @ 94ea5cb1
@@ -6,9 +6,9 @@ import multiprocessing
 import numpy as np
 from tqdm.contrib.concurrent import process_map
 from datasets import load_dataset

 from codecritic.evaluation.apps_exec import run_test
 from codecritic.utils.json import save_jsonl
 from codecritic.dataset.code import extract_code

 TIMEOUT = 10

@@ -41,7 +41,7 @@ def check_correctness(sample, generation, timeout, debug=False):
 def test_generation(args, debug=False):
     apps_item, sample = args
-    code = extract_code(sample["response"][0]["content"])
+    code = sample["meta_clean_code"]
     curr_res = [-2]
     try:

@@ -68,18 +68,20 @@ def test_generation(args, debug=False):
     problem_result = np.asarray(curr_res)
     return {
-        **sample,
-        "code": code,
-        "eval_result": bool(np.all(problem_result > 0)),
-        "testcase": curr_res
+        "task_id": sample["task_id"],
+        "solution_id": sample["solution_id"],
+        "pass": bool(np.all(problem_result > 0)),
+        "timeout": bool(-1 in curr_res),
+        "compilerr": bool(-2 in curr_res),
     }


 def evaluate_code_samples(code_samples, apps):
     args = []
     for sample in code_samples:
-        problem_id = sample["problem_id"]
-        args.append((apps["test"][int(problem_id)], sample))
+        task_id = sample["task_id"]
+        split, idx = task_id.split('-')
+        args.append((apps[split][int(idx)], sample))

     cpu_num = multiprocessing.cpu_count()
     chunksize = max(len(code_samples) // (cpu_num * 5), 1)

@@ -90,7 +92,7 @@ def evaluate_code_samples(code_samples, apps):
     return results


-def evaluate_incorrect_code_samples_again(results, apps, loop_num):
+def evaluate(code_samples, apps_path):
     """
     There are some strange bugs in apps evaluation that cannot be reproduced.
     The observable issue is that the same code will yield different 'eval_result' values.

@@ -100,42 +102,35 @@ def evaluate_incorrect_code_samples_again(results, apps, loop_num):
     Run twice to verify if the result is consistent.
-    The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
     """
-    maybe_incorrect_lst, correct_lst = [], []
-    for item in results:
-        if any(x in item["testcase"] for x in (-1, -2)):
-            maybe_incorrect_lst.append(item)
-        else:
-            correct_lst.append(item)
-
-    for _ in range(loop_num):
-        if len(maybe_incorrect_lst) == 0:
-            break
-        new_results = evaluate_code_samples(maybe_incorrect_lst, apps)
-        print(f"maybe incorrect lst size: {len(maybe_incorrect_lst)}")
-        check_lst = []
-        for i in range(len(new_results)):
-            old_item, new_item = maybe_incorrect_lst[i], new_results[i]
-            old_eval, new_eval = old_item["eval_result"], new_item["eval_result"]
-            if old_eval == new_eval:
-                correct_lst.append(old_item)
-            else:
-                check_lst.append(new_item)
-                print(old_item["problem_id"], old_eval, new_item["problem_id"], new_eval)
-        maybe_incorrect_lst = check_lst
-
-    if len(results) != len(correct_lst):
-        save_jsonl(maybe_incorrect_lst, "debug.jsonl")
-        # raise ValueError("cannot correctly evaluate codes")
-        print("cannot correctly evalute code. see debug.jsonl")
-        if len(maybe_incorrect_lst) < 5:
-            correct_lst.extend(maybe_incorrect_lst)
-
-    return correct_lst
-
-
-def evaluate(code_samples, apps):
-    results = evaluate_code_samples(code_samples, apps)
-    results = evaluate_incorrect_code_samples_again(results, apps, 10)
-    return results
+    apps = load_dataset(apps_path)
+
+    all_results = []
+    for _ in range(3):
+        results = evaluate_code_samples(code_samples, apps)
+        all_results.append(results)
+
+    final_results = []
+    for lst in map(list, zip(*all_results)):
+        assert len(set(x["task_id"] for x in lst)) == 1, "Mismatched task_id"
+        assert len(set(x["solution_id"] for x in lst)) == 1, "Mismatched solution_id"
+        task_id, solution_id = lst[0]["task_id"], lst[0]["solution_id"]
+
+        if all(x["compilerr"] for x in lst):
+            is_pass = False
+        else:
+            # If there is a compilation error in any of the multiple runs, treat it as an exception and remove it.
+            lst = [x for x in lst if not x["compilerr"]]
+            is_pass = all(x["pass"] for x in lst)
+
+        final_results.append({"task_id": task_id, "solution_id": solution_id, "pass": is_pass})
+
+    for sample, is_pass in zip(code_samples, final_results):
+        assert sample["task_id"] == is_pass["task_id"], "Mismatched task_id"
+        assert sample["solution_id"] == is_pass["solution_id"], "Mismatched solution_id"
+        sample["pass"] = is_pass["pass"]
+
+    return code_samples
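
For orientation, here is a minimal sketch of how the reworked evaluate() could be driven after this commit. Everything in it is illustrative: the input and output file names, the Hugging Face dataset path "codeparrot/apps", and the json-based loading are assumptions rather than part of the diff; samples are assumed to carry the "task_id" (e.g. "test-123"), "solution_id", and "meta_clean_code" fields that test_generation() expects.

# Hypothetical driver for the new evaluation flow (not part of this commit).
import json

from codecritic.evaluation.apps_eval import evaluate
from codecritic.utils.json import save_jsonl

# Read model samples from a JSONL file (assumed path and record schema).
with open("samples.jsonl") as f:
    code_samples = [json.loads(line) for line in f]

# evaluate() now loads the APPS dataset itself, runs evaluate_code_samples()
# three times, and attaches a consolidated boolean "pass" field to each sample.
results = evaluate(code_samples, "codeparrot/apps")  # dataset path is an assumption

save_jsonl(results, "apps_eval_results.jsonl")  # assumed output path

Repeating the run three times and reducing with all() trades extra compute for robustness against the non-reproducible APPS failures described in the docstring.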