Ziyuan Nan / codecritic · Commits

Commit 36054b07, authored Feb 10, 2025 by nzy
Merge branch 'main' of http://62.234.201.16/nzy/codecritic

Parents: 030e1e12, 2a43e44e
Showing 13 changed files with 301 additions and 109 deletions (+301 / -109)

    codecritic/cli/algolr.py             +32  -3
    codecritic/cli/eval.py               +25  -43
    codecritic/cli/test_genrm.py         +21  -6
    codecritic/cli/test_orm.py           +14  -3
    codecritic/cli/test_train.py         +76  -0
    codecritic/dataset/algolr_prompt.py  +4   -4
    codecritic/evaluation/apps_eval.py   +2   -2
    codecritic/evaluation/apps_exec.py   +7   -7
    codecritic/evaluation/metric.py      +3   -0
    scripts/algolr.sh                    +16  -5
    scripts/gen_dataset.sh               +1   -1
    scripts/raw.sh                       +62  -0
    scripts/train_orm.sh                 +38  -35
codecritic/cli/algolr.py

import argparse
from collections import defaultdict
from functools import partial
import pprint
import random

from vllm import SamplingParams
from datasets import load_dataset
...
@@ -51,11 +53,14 @@ if __name__ == "__main__":
    worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
    hint_responses = model_map(worker, hint_prompts, args.tp)
    pprint.pp(hint_responses[0])

    hints = [promptlib.postprocess_to_hint(x) for x in hint_responses]
    # hints: {"dataset"..., "task_id": ..., "solution_id": ..., "hints": ...}
    # save_jsonl(hint_responses, args.output + ".hint_responses")
    save_jsonl(hints, args.output + ".hints")
    # hints = load_jsonl(args.output + ".hints")

    hints_dict = defaultdict(dict)
    for item in hints:
...
@@ -73,7 +78,7 @@ if __name__ == "__main__":
        reason_prompts.append(chosen_prompt)

        # rejected
        rejected_hints = hints_dict[task_id][rejected_id]
        rejected_hints = hints_dict[task_id][rejected_id]["hint"]
        rejected_prompt = promptlib.process_to_reason_prompt(rejected, rejected_hints)
        reason_prompts.append(rejected_prompt)
...
@@ -86,6 +91,8 @@ if __name__ == "__main__":
    worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
    reason_responses = model_map(worker, reason_prompts, args.tp)
    pprint.pp(reason_responses[0])
    save_jsonl(reason_responses, args.output + ".reason")

    # Step3 Verify reasoning results
    # add prompt "correct the code based the reasoning"
...
@@ -114,6 +121,7 @@ if __name__ == "__main__":
    worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
    verify_responses = model_map(worker, reason_responses, args.tp)
    pprint.pp(verify_responses[0])
    print("verify response size: {}".format(len(verify_responses)))

    # postprocess verify_response.
...
@@ -148,8 +156,7 @@ if __name__ == "__main__":
    print("Corrects (execution consistent) size: {}".format(len(corrects)))
    print("Incorrects (execution consistent) size: {}".format(len(incorrects)))

    # Step4 Remove hints and Reformat to a SFT dataset
    # extract reasoning sets
    # Step4 Reformat to a SFT dataset
    sft = []
    for item in verify_passed:
...
@@ -169,4 +176,26 @@ if __name__ == "__main__":
            sft.append(line)

    print("Size of sft dataset: {}".format(len(sft)))
    pprint.pp(sft[0])
    save_jsonl(sft, args.output)

    # Step5 keep 1 rationale for 1 solution
    task_solution_map = defaultdict(lambda: defaultdict(list))
    for entry in sft:
        task_id = entry["task_id"]
        solution_id = entry["solution_id"]
        task_solution_map[task_id][solution_id.split("_")[0]].append(entry)

    # Step 2: Keep only one reasoning for each solution
    processed_dataset = []
    for task_id, solution_map in task_solution_map.items():
        for solution, reasoning_list in solution_map.items():
            if len(reasoning_list) > 1:
                selected_index = random.choice(range(1, len(reasoning_list)))
                processed_dataset.append(reasoning_list[selected_index])
            else:
                processed_dataset.append(reasoning_list[0])

    save_jsonl(processed_dataset, args.output.split('.')[0] + "-filtered.jsonl")
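To make the Step5 grouping above concrete: entries whose solution_id shares the prefix before "_" are collapsed onto one solution key, so several rationales generated for the same solution land in one bucket and a single one is kept. A small illustration with hypothetical ids (the real id format is not documented in this diff beyond the split("_")[0] call):

    # Hypothetical entries; only the grouping behaviour is being illustrated.
    entries = [
        {"task_id": "t1", "solution_id": "7_0"},
        {"task_id": "t1", "solution_id": "7_1"},
        {"task_id": "t1", "solution_id": "8_0"},
    ]
    # task_solution_map["t1"]["7"] -> two rationales (one kept at random)
    # task_solution_map["t1"]["8"] -> one rationale (kept as-is)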
codecritic/cli/eval.py

import argparse
from collections import defaultdict
import json
from functools import partial
from pathlib import Path

import codecritic.evaluation.metric as metric
from codecritic.utils.json import load_jsonl, save_jsonl


def eval(samples_path):
    model, testset = samples_path.stem.split('-')[:2]

    def f(item):
        item["model"] = model
        item["testset"] = testset

    samples = load_jsonl(samples_path)

from codecritic.utils.json import load_jsonl

def eval(scores):
    ks = list(range(1, 17))
    results = []
    results.append(metric.pass_at_k(samples, ks))
    results.append(metric.top_at_k(samples, ks, metric.postive_and_negative))
    results.append(metric.top_at_k(samples, ks, metric.positive_only))
    # results.extend(metric.pass_at_k(scores, ks))
    # results.extend(metric.pass_at_k(scores, [50]))
    # results.extend(metric.top_at_k(scores, ks, metric.positive_only))
    if "negative_score" in scores[0]:
        results.extend(metric.top_at_k(scores, ks, metric.postive_and_negative))
        for i in range(4):
            threshold = 0.5 + i * 0.1
            score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
            results.append(metric.top_at_k(samples, ks, score_func))
    # for i in range(4):
    #     threshold = 0.5 + i * 0.1
    #     score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
    #     results.extend(metric.top_at_k(scores, ks, score_func))
    return list(map(f, results))
    return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--sample_dir",
        type=str,
        default=None,
        help="Path to the directory containing samples. If not provided, cached results will be used."
    )
    parser.add_argument("--out_dir", type=str, help="path/to/output_dir")
    parser.add_argument(
        "--score_func",
        type=str,
        default="all",
        choices=["all", "posonly", "posneg", "posneg_filter"],  # Add valid options
        help="Select the scoring function to use. Default: 'all'."
    )
    parser.add_argument("--plot", type=str, help="path/to/plot")
    parser.add_argument("--score", type=str, help="path/to/score")
    args = parser.parse_args()

    outdir = Path(args.out_dir)

    if args.sample_dir:
        for samples_path in Path(args.sample_dir).glob("*.jsonl"):
            out_path = outdir / (samples_path.stem + "-eval.jsonl")
            if not out_path.exists():
                eval_results = eval(samples_path)
                save_jsonl(eval_results, out_path)

    scores = load_jsonl(args.score)
    groups = defaultdict(list)
    for item in scores:
        groups[item["dataset"]].append(item)

    for out_path in outdir.glob("*.jsonl"):
        pass
\ No newline at end of file
    for dataset, lst in groups.items():
        results = eval(lst)
        for r in results:
            r["dataset"] = dataset
            print(json.dumps(r))
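metric.pass_at_k itself is not shown in this diff; it takes the sample list plus a list of k values, but at its core is presumably the standard unbiased per-task pass@k estimator from the HumanEval evaluation. A standalone sketch of that core (the name and signature here are illustrative, not copied from the repository):

    import numpy as np

    def pass_at_k(n: int, c: int, k: int) -> float:
        """Unbiased pass@k for one task: n samples drawn, c of them correct."""
        if n - c < k:
            return 1.0
        # 1 - C(n-c, k) / C(n, k), computed as a numerically stable running product
        return 1.0 - float(np.prod(1.0 - k / np.arange(n - c + 1, n + 1)))

    # Example: 16 samples, 4 correct, budget of 1 -> pass@1 = 0.25
    # pass_at_k(n=16, c=4, k=1) == 0.25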
codecritic/cli/test_genrm.py

import argparse
from collections import defaultdict
from functools import partial
import os

from transformers import AutoTokenizer
from vllm import SamplingParams

from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
import codecritic.dataset.algolr_prompt as algolr_prompt


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--sample", type=str, help="path/to/sample")
    parser.add_argument("--testset", type=str, help="path/to/testset")
    parser.add_argument("--output", type=str, help="path/to/score")
    parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
    parser.add_argument(
...
@@ -31,11 +33,24 @@ if __name__ == "__main__":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    dataset = load_jsonl(args.sample)
    dataset = load_jsonl(args.testset)

    ds = defaultdict(list)
    for item in dataset:
        ds[item["task_id"]].append(item)

    unsolvable = []
    dataset = []
    for task_id, items in ds.items():
        if all([not x["pass"] for x in items]):
            for item in items:
                item["positive_score"] = 0
                item["negative_score"] = 0
            unsolvable.extend(items)
        else:
            dataset.extend(items)

    if args.reasoning:
        for item in dataset:
            item["messages"].append(THINK_MESSAGE)
        dataset = [algolr_prompt.process_to_reason_prompt(x, None) for x in dataset]

    sampling_params = SamplingParams(
        n=1,
...
@@ -68,4 +83,4 @@ if __name__ == "__main__":
    )
    dataset = model_map(worker, dataset, args.tp)

    save_jsonl(dataset, args.output)
    save_jsonl(dataset + unsolvable, args.output)
codecritic/cli/test_orm.py

import argparse
from collections import defaultdict
import json

import requests
from tqdm import tqdm
...
@@ -21,18 +22,28 @@ def get_rewards_from_server(server_url: str, messages: list[str]):
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--sample", type=str, help="path/to/sample")
    parser.add_argument("--testset", type=str, help="path/to/testset")
    parser.add_argument("--output", type=str, help="path/to/score")
    args = parser.parse_args()

    # compute score
    dataset = load_jsonl(args.sample)
    dataset = load_jsonl(args.testset)

    ds = defaultdict(list)
    for item in dataset:
        ds[item["task_id"]].append(item)

    for task_id, items in ds.items():
        if all([not x["pass"] for x in items]):
            for item in items:
                item["positive_score"] = 0

    server_url = "http://0.0.0.0:5000/get_reward"
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    for item in tqdm(dataset):
        if 'positive_score' not in item:
            query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
            score = get_rewards_from_server(server_url, [query])[0]
            item["score"] = score
            item["positive_score"] = score

    save_jsonl(dataset, args.output)
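The hunk header above references get_rewards_from_server(server_url: str, messages: list[str]), but its body lies outside this diff. A minimal sketch of such a client for an OpenRLHF serve_rm-style endpoint; the JSON field names ("query", "rewards") and the timeout are assumptions, not taken from the repository:

    import requests

    def get_rewards_from_server(server_url: str, messages: list[str]) -> list[float]:
        # Assumed request/response schema: {"query": [...]} in, {"rewards": [...]} out.
        response = requests.post(server_url, json={"query": messages}, timeout=300)
        response.raise_for_status()
        return response.json()["rewards"]

    # Example call mirroring the loop in test_orm.py:
    # score = get_rewards_from_server("http://0.0.0.0:5000/get_reward", [query])[0]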
codecritic/cli/test_train.py (new file, 0 → 100644)

import argparse
from functools import partial
import os
import pprint

from transformers import AutoTokenizer
from vllm import SamplingParams

from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.evaluation.metric import postive_and_negative, binary_metrics


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--trainset", type=str, help="path/to/testset")
    parser.add_argument("--output", type=str, help="path/to/output")
    parser.add_argument(
        "--reason_max_tokens",
        type=int,
        default=4096,
        help="maximum number of tokens allowed for the reasoning process.",
    )
    parser.add_argument("--tp", type=int, default=1, help="tensor parallel")
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    dataset = load_jsonl(args.trainset)[:1000]
    for item in dataset:
        item["messages"] = item["question"]
        item["pass"] = (item["response"][-1]["content"] == "Yes")

    sampling_params = SamplingParams(
        n=1,
        temperature=0,
        top_p=0.95,
        max_tokens=args.reason_max_tokens,
    )
    worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
    dataset = model_map(worker, dataset, args.tp)

    def get_token_id(token):
        score_tokens = tokenizer.encode(token, add_special_tokens=False)
        assert len(score_tokens) == 1
        return score_tokens[0]

    positive_token = get_token_id(JUDGE_TOEKNS["positive"])
    negative_token = get_token_id(JUDGE_TOEKNS["negative"])

    for item in dataset:
        item["messages"].append(JUDGE_MESSAGE)

    worker = partial(
        score_worker,
        model_path=args.model,
        positive_token=positive_token,
        negative_token=negative_token,
    )
    dataset = model_map(worker, dataset, args.tp)

    scores = [postive_and_negative(item) for item in dataset]
    labels = [item["pass"] for item in dataset]
    pprint.pp(binary_metrics(labels, scores))

    save_jsonl(dataset, args.output)
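score_worker (imported from codecritic.utils.inference) is not part of this diff. Judging from the positive_token/negative_token arguments and the downstream postive_and_negative call, it presumably converts next-token probabilities of the judge tokens into positive_score/negative_score fields. A hypothetical helper showing that conversion; the shape of the logprob input is an assumption, not the repository's actual API:

    import math

    def judge_token_scores(logprobs: dict[int, float], positive_token: int, negative_token: int) -> dict:
        """Turn next-token log-probabilities at the judge position into the
        positive_score / negative_score fields consumed by postive_and_negative().
        `logprobs` maps token id -> log-probability (hypothetical shape)."""
        pos = math.exp(logprobs[positive_token]) if positive_token in logprobs else 0.0
        neg = math.exp(logprobs[negative_token]) if negative_token in logprobs else 0.0
        return {"positive_score": pos, "negative_score": neg}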
codecritic/dataset/algolr_prompt.py
...
@@ -152,10 +152,7 @@ def remove_hint(item):
def extract_conclusion_and_code(response):
    # Extract conclusion
    if 'Conclusion:' not in response:
        conclusion = None
        print("not found conclusion\n{}".format(response))
    else:
        try:
            conclusion_line = [line for line in response.split('\n') if line.startswith('Conclusion:')][0]
            conclusion_str = conclusion_line.split(': ')[1].strip().lower()
...
@@ -166,6 +163,9 @@ def extract_conclusion_and_code(response):
            else:
                print("llm doesn't draw to a conclusion\n{}".format(response))
                conclusion = None
        except Exception as e:
            print("not found conclusion\n{}\n{}".format(response, e))
            conclusion = None

    # Extract corrected code if conclusion is 'No'
    corrected_code = ""
...
codecritic/evaluation/apps_eval.py
...
@@ -84,7 +84,7 @@ def evaluate_code_samples(code_samples, apps):
    cpu_num = multiprocessing.cpu_count() // 2
    chunksize = max(len(code_samples) // (cpu_num * 10), 1)
    results = process_map(
        test_generation, args, max_workers=cpu_num, chunksize=chunksize
        test_generation, args, max_workers=cpu_num, chunksize=1
    )
    return results
...
@@ -100,7 +100,7 @@ def evaluate(code_samples, apps):
    The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
    """
    all_results = []
    for _ in range(2):
    for _ in range(1):
        results = evaluate_code_samples(code_samples, apps)
        all_results.append(results)
...
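For reference, tqdm.contrib.concurrent.process_map is a thin wrapper over ProcessPoolExecutor.map with a progress bar; its chunksize argument controls how many argument tuples are shipped to a worker at once. A standalone sketch of the call pattern used above, with a stand-in function in place of test_generation:

    from tqdm.contrib.concurrent import process_map

    def square(x):
        return x * x

    if __name__ == "__main__":
        # chunksize=1 sends items to workers one at a time, which keeps the progress
        # bar responsive for slow, uneven tasks such as executing generated code.
        results = process_map(square, range(100), max_workers=4, chunksize=1)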
codecritic/evaluation/apps_exec.py
...
@@ -350,9 +350,9 @@ def run_test(sample, test=None, debug=False):
                # try by converting the stuff into split up list
                if isinstance(in_outs["outputs"][index], list):
                    for tmp_index, i in enumerate(in_outs["outputs"][index]):
                        in_outs["outputs"][index][tmp_index] = set(i.split())
                        in_outs["outputs"][index][tmp_index] = list(i.split())
                else:
                    in_outs["outputs"][index] = set(in_outs["outputs"][index].split())
                    in_outs["outputs"][index] = list(in_outs["outputs"][index].split())

                try:
                    tmp_result = (output == in_outs["outputs"][index])
...
@@ -371,14 +371,14 @@ def run_test(sample, test=None, debug=False):
                        output[tmp_index] = i.split()
                    output = list(filter(len, output))
                    for tmp_index, i in enumerate(output):
                        output[tmp_index] = set(i)
                        output[tmp_index] = list(i)
                else:
                    output = output.split()
                    output = list(filter(len, output))
                    output = set(output)
                    output = list(output)

                try:
                    tmp_result = (set(frozenset(s) for s in output) == set(frozenset(s) for s in in_outs["outputs"][index]))
                    tmp_result = (list(list(s) for s in output) == list(list(s) for s in in_outs["outputs"][index]))
                except Exception as e:
                    if debug:
                        print(f"Failed check5 exception = {e}")
...
@@ -386,8 +386,8 @@ def run_test(sample, test=None, debug=False):
                # if they are all numbers, round so that similar numbers are treated as identical
                try:
                    tmp_result = tmp_result or (set(frozenset(round(float(t), 3) for t in s) for s in output) == \
                        set(frozenset(round(float(t), 3) for t in s) for s in in_outs["outputs"][index]))
                    tmp_result = tmp_result or (list(list(round(float(t), 3) for t in s) for s in output) == \
                        list(list(round(float(t), 3) for t in s) for s in in_outs["outputs"][index]))
                except Exception as e:
                    if debug:
                        print(f"Failed check6 exception = {e}")
...
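Assuming the usual old-before-new ordering of the inline diff, these hunks replace set-based output comparison with list-based comparison, which makes the check order-sensitive and duplicate-sensitive. A small standalone illustration (not taken from the repository):

    expected = "1 2 2 3".split()   # ['1', '2', '2', '3']
    got      = "3 2 1 2".split()   # ['3', '2', '1', '2']

    set(got) == set(expected)      # True  - set comparison ignores order and duplicates
    list(got) == list(expected)    # False - list comparison requires the exact sequence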
codecritic/evaluation/metric.py
...
@@ -50,6 +50,9 @@ def positive_only(item):
def postive_and_negative(item):
    pos = item["positive_score"]
    neg = item["negative_score"]
    if (pos + neg) == 0:
        return 0
    else:
        return pos / (pos + neg)
...
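The three added lines guard against a zero denominator. This matters for the unsolvable items that test_genrm.py now appends with both scores set to 0, which would otherwise divide by zero. A quick illustration with hypothetical items:

    postive_and_negative({"positive_score": 0.8, "negative_score": 0.2})  # -> 0.8
    postive_and_negative({"positive_score": 0, "negative_score": 0})      # -> 0 instead of a ZeroDivisionError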
scripts/algolr.sh
...
@@ -3,18 +3,21 @@ set -xe
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst"
data="${project}/data"

trainset="${project}/data/train/${modelname}-apps-train.jsonl"
testset="${project}/data/test/${modelname}-apps-test.jsonl"
trainset="${data}/train/${modelname}-apps-train.jsonl"
testset="${data}/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
train_selected_pairs="${data}/train/${modelname}-apps-train-selected_pairs.jsonl"
apps="/lustre/S/nanziyuan/datasets/apps/"
sft="${project}/data/train/${modelname}-sft.jsonl"
sft="${data}/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"
testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="${data}/eval/qwen25_code_inst-apps-test-algolr-score.jsonl"

#
## export CUDA_VISIBLE_DEVICES=0,1,2,3
# export CUDA_VISIBLE_DEVICES=0,1,2,3

python -m codecritic.cli.algolr \
...
@@ -50,3 +53,11 @@ openrlhf.cli.train_sft \
    --load_checkpoint \
    --gradient_checkpointing \
    --use_tensorboard "${ftmodel}_log"

python -m codecritic.cli.test_genrm \
    --model ${ftmodel} \
    --testset ${testset} \
    --output ${evalresults} \
    --reasoning \
    --tp 1
scripts/gen_dataset.sh
...
@@ -12,7 +12,7 @@ train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pai
reward_ds="${project}/data/train/${modelname}-apps-train-reward_dataset.jsonl"

export CUDA_VISIBLE_DEVICES=0,1,2,3
# export CUDA_VISIBLE_DEVICES=0,1,2,3

## Sampling
## APPS
...
scripts/raw.sh (new file, 0 → 100644)

set -xe

model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst"

trainset="${project}/data/train/${modelname}-apps-train.jsonl"
testset="${project}/data/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
apps="/lustre/S/nanziyuan/datasets/apps/"
sft="${project}/data/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"

testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-genrm-score.jsonl"

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# python -m codecritic.cli.algolr \
#     --model ${model} \
#     --dataset ${trainset} \
#     --pairinfo ${train_selected_pairs} \
#     --apps ${apps} \
#     --output ${sft} \
#     --level beginner \
#     --tp 1

# deepspeed --module \
#     openrlhf.cli.train_sft \
#     --max_len 4096 \
#     --dataset ${sft} \
#     --input_key question \
#     --output_key response \
#     --apply_chat_template \
#     --train_batch_size 256 \
#     --micro_train_batch_size 2 \
#     --max_samples 500000 \
#     --pretrain ${model} \
#     --save_path ${ftmodel} \
#     --save_steps -1 \
#     --logging_steps 1 \
#     --eval_steps -1 \
#     --zero_stage 2 \
#     --max_epochs 1 \
#     --bf16 \
#     --flash_attn \
#     --learning_rate 5e-6 \
#     --load_checkpoint \
#     --gradient_checkpointing \
#     --use_tensorboard "${ftmodel}_log"

python -m codecritic.cli.test_genrm \
    --model ${model} \
    --testset ${testset} \
    --output ${evalresults} \
    --reasoning \
    --tp 1
scripts/train_orm.sh
...
@@ -6,6 +6,9 @@ project="/lustre/S/nanziyuan/projects/ccc"
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
ftmodel="${project}/model/qwen25_coder_inst_7b-orm"

testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-orm-score.jsonl"

deepspeed --module \
    openrlhf.cli.train_rm \
    --save_path ${ftmodel} \
...
@@ -31,41 +34,41 @@ openrlhf.cli.train_rm \
    --use_tensorboard "${ftmodel}_log"

# start_server() {
#     echo "Starting server..."
#     CUDA_VISIBLE_DEVICES=0 \
#     python -m openrlhf.cli.serve_rm \
#         --reward_pretrain ${model} \
#         --normalize_reward \
#         --port 5000 \
#         --bf16 \
#         --max_len 8192 &
#     SERVER_PID=$!
#     echo "Server started with PID: $SERVER_PID"
# }

start_server() {
    echo "Starting server..."
    CUDA_VISIBLE_DEVICES=0 \
    python -m openrlhf.cli.serve_rm \
        --reward_pretrain ${ftmodel} \
        --normalize_reward \
        --port 5000 \
        --bf16 \
        --max_len 8192 &
    SERVER_PID=$!
    echo "Server started with PID: $SERVER_PID"
}

# # Function to start the client
# start_client() {
#     echo "Starting client..."
#     python -m codecritic.cli.run_rm_test \
#         --model ${model} \
#         --test "${datasets}/sample/min_test.jsonl" \
#         --apps /lustre/S/nanziyuan/datasets/apps/
#     CLIENT_EXIT_CODE=$?
#     echo "Client finished with exit code: $CLIENT_EXIT_CODE"
# }

# Function to start the client
start_client() {
    echo "Starting client..."
    python -m codecritic.cli.test_orm \
        --model ${ftmodel} \
        --testset ${testset} \
        --output ${evalresults}
    CLIENT_EXIT_CODE=$?
    echo "Client finished with exit code: $CLIENT_EXIT_CODE"
}

# # Function to stop the server
# stop_server() {
#     echo "Stopping server..."
#     kill -SIGINT $SERVER_PID
#     wait $SERVER_PID 2>/dev/null
#     echo "Server stopped."
# }

# Function to stop the server
stop_server() {
    echo "Stopping server..."
    kill -SIGINT $SERVER_PID
    wait $SERVER_PID 2>/dev/null
    echo "Server stopped."
}

# start_server
# # Give the server some time to initialize (optional)
# sleep 60
# start_client
# stop_server
# echo "Execution complete."

start_server
# Give the server some time to initialize (optional)
sleep 60
start_client
stop_server
echo "Execution complete."