Ziyuan Nan / codecritic
Commit e864d804 authored Feb 28, 2025 by nanziyuan
r1

parent ef55d00d
Showing 4 changed files with 25 additions and 12 deletions:

    codecritic/cli/eval_r1.py    +20  -7
    codecritic/cli/test_r1.py     +1  -1
    scripts/config.yaml           +1  -1
    scripts/r1_test.sh            +3  -3
codecritic/cli/eval_r1.py

@@ -5,9 +5,13 @@ from functools import partial
 import codecritic.evaluation.metric as metric
 from codecritic.utils.json import load_jsonl
+import pprint
 
 
 def confidence(item):
     sign = 1 if item["prediction"] else -1
+    if item["confidence"] is None:
+        return -1
     return sign * item["confidence"]
 
 
 def eval(scores):
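Note on the hunk above: before this commit, an item whose "confidence" field was None would raise a TypeError at `sign * item["confidence"]`; the added guard maps such items to the lowest score instead. A minimal behavior sketch, using hypothetical items:

    # Behavior sketch for the patched confidence() above (items are hypothetical).
    def confidence(item):
        sign = 1 if item["prediction"] else -1
        if item["confidence"] is None:
            return -1  # missing/unparseable confidence now scores lowest
        return sign * item["confidence"]

    print(confidence({"prediction": True,  "confidence": 0.9}))   # 0.9
    print(confidence({"prediction": False, "confidence": 0.9}))   # -0.9
    print(confidence({"prediction": True,  "confidence": None}))  # -1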
@@ -37,12 +41,21 @@ if __name__ == "__main__":
     scores = load_jsonl(args.score)
 
     groups = defaultdict(list)
     for item in scores:
-        groups[item["dataset"]].append(item)
+        groups[item["task_id"]].append(item)
 
+    newscores = []
     for dataset, lst in groups.items():
-        results = eval(lst)
-        for r in results:
-            r["dataset"] = dataset
-            r["strategy"] = "r1_qwen_7b"
-            del r["score_func"]
-            print(json.dumps(r))
+        pass_lst = [x["pass"] for x in lst]
+        if any(pass_lst):
+            print(sum(pass_lst))
+        newscores.extend(lst)
+        # results = eval(lst)
+        # for r in results:
+        #     r["dataset"] = dataset
+        #     r["strategy"] = "r1_qwen_7b"
+        #     del r["score_func"]
+        #     print(json.dumps(r))
+
+    print(len(newscores))
+    labels, bscores = [x["pass"] for x in newscores], [1 if x["prediction"] else 0 for x in scores]
+    pprint.pp(metric.binary_metrics(labels, bscores))
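The rewritten loop groups by task_id instead of dataset, prints the passing-sample count for any task with at least one pass, keeps every item in newscores, and reports overall binary metrics; the old per-dataset reporting survives only as comments. metric.binary_metrics is called here but not shown in this commit; the sketch below is a hypothetical stand-in computing standard confusion-matrix metrics over such label/prediction lists, not the repository's actual code:

    # Hypothetical stand-in for codecritic.evaluation.metric.binary_metrics,
    # which this commit calls but does not show.
    def binary_metrics(labels, preds):
        pairs = list(zip(labels, preds))
        tp = sum(1 for l, p in pairs if l and p)
        fp = sum(1 for l, p in pairs if not l and p)
        fn = sum(1 for l, p in pairs if l and not p)
        tn = sum(1 for l, p in pairs if not l and not p)
        precision = tp / (tp + fp) if (tp + fp) else 0.0
        recall = tp / (tp + fn) if (tp + fn) else 0.0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
        accuracy = (tp + tn) / len(pairs) if pairs else 0.0
        return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

One thing to watch in the committed code: labels is built from newscores while bscores is built from scores, and since newscores re-orders items by task group, the two lists may not be index-aligned.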
codecritic/cli/test_r1.py

@@ -118,7 +118,7 @@ if __name__ == "__main__":
     #with ThreadPoolExecutor(max_workers=4) as executor:
     #    responses = executor.map(chat_fun, prompts)
-    responses = thread_map(chat_fun, prompts, max_workers=4)
+    responses = thread_map(chat_fun, prompts, max_workers=8)
 
     for item, response in zip(dataset, responses):
         judgement, confidence = postprocess_response(response)
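The only change in this hunk raises the thread pool from 4 to 8 workers. thread_map is presumably tqdm.contrib.concurrent.thread_map (its import sits outside the hunk); a self-contained sketch of the pattern, with chat_fun as a hypothetical stand-in for the real chat-completion call:

    # Assumes thread_map is tqdm.contrib.concurrent.thread_map.
    from tqdm.contrib.concurrent import thread_map

    def chat_fun(prompt):
        return f"echo: {prompt}"  # placeholder for an API round trip

    prompts = [f"prompt {i}" for i in range(32)]
    # Maps chat_fun over prompts on 8 threads, shows a progress bar, and
    # returns the results in input order.
    responses = thread_map(chat_fun, prompts, max_workers=8)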
scripts/config.yaml

@@ -4,7 +4,7 @@ llm_kit:
     router_port: 8000
     tensor_parallel_size: 1
     pipeline_parallel_size: 1
-    data_parallel_size: 4
+    data_parallel_size: 8
     router_timeout: 1200
     random_seeds:
         - 1111
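This bumps data_parallel_size from 4 to 8. Assuming llmkit follows the common convention where each data-parallel replica occupies tensor_parallel_size x pipeline_parallel_size GPUs, the change doubles the serving replicas and the GPUs required:

    # Hedged arithmetic under the assumed convention above; llmkit's actual
    # semantics are not shown in this commit.
    tensor_parallel_size = 1
    pipeline_parallel_size = 1
    data_parallel_size = 8  # was 4 before this commit

    gpus_per_replica = tensor_parallel_size * pipeline_parallel_size  # 1
    total_gpus = gpus_per_replica * data_parallel_size
    print(total_gpus)  # 8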
scripts/r1_test.sh

@@ -3,11 +3,11 @@ model="/share/collab/codemodel/models/DeepSeek-R1-Distill-Qwen-7B/"
 data="/nfs_global/S/nanziyuan/projects/ccc/data"
 testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
-evalresults="${data}/eval/qwen25_code_inst-apps-test-r1_7b_test.jsonl"
+evalresults="${data}/eval/qwen25_code_inst-apps-test-r1_7b.jsonl"
 
-# python -m llmkit_data.cli.serve --config /nfs_global/S/nanziyuan/projects/ccc/src/scripts/config.yaml &
+python -m llmkit_data.cli.serve --config /nfs_global/S/nanziyuan/projects/ccc/src/scripts/config.yaml &
 # vllm serve ${model} --max_model 12288
-# sleep 300s
+sleep 300s
 
 python -m codecritic.cli.test_r1 \
     --model ${model} \
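The script now starts the llmkit router itself and waits a fixed 300 s before running the test client. A hedged alternative, not part of this commit, is to poll the router port (8000 per scripts/config.yaml) until it accepts connections:

    # Sketch only: replaces the fixed "sleep 300s" with a readiness poll.
    # Host, port, and timeout are assumptions based on scripts/config.yaml.
    import socket
    import time

    def wait_for_port(host="127.0.0.1", port=8000, timeout=1200.0):
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            try:
                with socket.create_connection((host, port), timeout=2.0):
                    return True
            except OSError:
                time.sleep(5)
        return False

    if not wait_for_port():
        raise SystemExit("router did not become ready in time")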