codecritic / Commits

Commit 2a43e44e
Authored Jan 14, 2025 by nanziyuan
Parent: 97085438

fix apps bugs & filter rationale

Showing 6 changed files with 139 additions and 111 deletions
codecritic/cli/algolr.py            +47  -20
codecritic/evaluation/apps_eval.py   +2   -2
codecritic/evaluation/apps_exec.py  +26  -26
scripts/algolr.sh                   +40  -39
scripts/gen_dataset.sh               +1   -1
scripts/train_orm.sh                +23  -23
codecritic/cli/algolr.py  (view file @ 2a43e44e)

 import argparse
 from collections import defaultdict
 from functools import partial
+import pprint
+import random

 from vllm import SamplingParams
 from datasets import load_dataset
...
@@ -35,28 +37,30 @@ if __name__ == "__main__":
        ds[item["task_id"]][item["solution_id"]] = item

    # Step1 Generate hints
-    # hint_prompts = []
-    # for pair in pairinfo:
-    #     task_id, chosen_id, rejected_id = pair["task_id"], pair["chosen"], pair["rejected"]
-    #     chosen, rejected = ds[task_id][chosen_id], ds[task_id][rejected_id]
-    #     prompt = promptlib.process_to_hint_prompt(chosen, rejected, args.level)
-    #     hint_prompts.append(prompt)
+    hint_prompts = []
+    for pair in pairinfo:
+        task_id, chosen_id, rejected_id = pair["task_id"], pair["chosen"], pair["rejected"]
+        chosen, rejected = ds[task_id][chosen_id], ds[task_id][rejected_id]
+        prompt = promptlib.process_to_hint_prompt(chosen, rejected, args.level)
+        hint_prompts.append(prompt)

-    # sampling_params = SamplingParams(
-    #     n=1,
-    #     temperature=0,
-    #     top_p=0.95,
-    #     max_tokens=2048,
-    # )
+    sampling_params = SamplingParams(
+        n=1,
+        temperature=0,
+        top_p=0.95,
+        max_tokens=2048,
+    )

-    # worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
-    # hint_responses = model_map(worker, hint_prompts, args.tp)
+    worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
+    hint_responses = model_map(worker, hint_prompts, args.tp)
+    pprint.pp(hint_responses[0])

-    # hints = [promptlib.postprocess_to_hint(x) for x in hint_responses]
-    # # hints: {"dataset"..., "task_id": ..., "solution_id": ..., "hints": ...}
+    hints = [promptlib.postprocess_to_hint(x) for x in hint_responses]
+    # hints: {"dataset"..., "task_id": ..., "solution_id": ..., "hints": ...}

-    # save_jsonl(hints, args.output + ".hints")
-    hints = load_jsonl(args.output + ".hints")
+    # save_jsonl(hint_responses, args.output + ".hint_responses")
+    save_jsonl(hints, args.output + ".hints")
+    # hints = load_jsonl(args.output + ".hints")

    hints_dict = defaultdict(dict)
    for item in hints:
...
@@ -87,6 +91,7 @@ if __name__ == "__main__":
    worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
    reason_responses = model_map(worker, reason_prompts, args.tp)
+    pprint.pp(reason_responses[0])
    save_jsonl(reason_responses, args.output + ".reason")

    # Step3 Verify reasoning results
...
@@ -116,6 +121,7 @@ if __name__ == "__main__":
    worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
    verify_responses = model_map(worker, reason_responses, args.tp)
+    pprint.pp(verify_responses[0])
    print("verify response size: {}".format(len(verify_responses)))

    # postprocess verify_response.
...
@@ -150,8 +156,7 @@ if __name__ == "__main__":
    print("Corrects (execution consistent) size: {}".format(len(corrects)))
    print("Incorrects (execution consistent) size: {}".format(len(incorrects)))

-    # Step4 Remove hints and Reformat to a SFT dataset
-    # extract reasoning sets
+    # Step4 Reformat to a SFT dataset
    sft = []
    for item in verify_passed:
...
@@ -171,4 +176,26 @@ if __name__ == "__main__":
        sft.append(line)

    print("Size of sft dataset: {}".format(len(sft)))
+    pprint.pp(sft[0])
    save_jsonl(sft, args.output)

+    # Step5 keep 1 rationale for 1 solution
+    task_solution_map = defaultdict(lambda: defaultdict(list))
+    for entry in sft:
+        task_id = entry["task_id"]
+        solution_id = entry["solution_id"]
+        task_solution_map[task_id][solution_id.split("_")[0]].append(entry)
+
+    # Step 2: Keep only one reasoning for each solution
+    processed_dataset = []
+    for task_id, solution_map in task_solution_map.items():
+        for solution, reasoning_list in solution_map.items():
+            if len(reasoning_list) > 1:
+                selected_index = random.choice(range(1, len(reasoning_list)))
+                processed_dataset.append(reasoning_list[selected_index])
+            else:
+                processed_dataset.append(reasoning_list[0])
+
+    save_jsonl(processed_dataset, args.output.split('.')[0] + "-filtered.jsonl")
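
The Step5 block keeps one rationale per solution by grouping rows on the portion of solution_id before the first underscore. A minimal, self-contained sketch of that grouping idea follows; the "<solution>_<rationale index>" naming (e.g. "12_0", "12_1") is an assumption inferred from the split("_")[0] call, and the toy rows are purely hypothetical.

from collections import defaultdict
import random

# Hypothetical toy rows, only to illustrate the grouping key.
rows = [
    {"task_id": "t1", "solution_id": "12_0"},
    {"task_id": "t1", "solution_id": "12_1"},
    {"task_id": "t1", "solution_id": "13_0"},
]

groups = defaultdict(lambda: defaultdict(list))
for row in rows:
    solution_key = row["solution_id"].split("_")[0]   # "12_1" -> "12"
    groups[row["task_id"]][solution_key].append(row)

# Keep one randomly chosen rationale per (task, solution) group.
filtered = [random.choice(rationales)
            for solutions in groups.values()
            for rationales in solutions.values()]
print(len(filtered))  # 2: one row for solution "12", one for "13"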
codecritic/evaluation/apps_eval.py  (view file @ 2a43e44e)
...
@@ -84,7 +84,7 @@ def evaluate_code_samples(code_samples, apps):
    cpu_num = multiprocessing.cpu_count() // 2
    chunksize = max(len(code_samples) // (cpu_num * 10), 1)
    results = process_map(
-        test_generation, args, max_workers=cpu_num, chunksize=chunksize
+        test_generation, args, max_workers=cpu_num, chunksize=1
    )
    return results
...
@@ -100,7 +100,7 @@ def evaluate(code_samples, apps):
    The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
    """
    all_results = []
-    for _ in range(2):
+    for _ in range(1):
        results = evaluate_code_samples(code_samples, apps)
        all_results.append(results)
...
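
The docstring above refers to retrying the evaluation until the test framework gives a consistent result (this hunk reduces the number of passes from 2 to 1). As a rough illustration of the retry-and-aggregate pattern only, not the repository's implementation, a sketch could look like the following; run_once is a hypothetical stand-in for one evaluate_code_samples pass over a single sample:

from collections import Counter

def evaluate_with_retries(samples, run_once, loop_num=2):
    """Sketch: rerun a flaky per-sample evaluator and keep the majority verdict."""
    verdicts_per_sample = [[] for _ in samples]
    for _ in range(loop_num):
        for i, sample in enumerate(samples):
            verdicts_per_sample[i].append(run_once(sample))  # e.g. True / False
    # Majority vote so a single flaky run does not flip a sample's label.
    return [Counter(v).most_common(1)[0][0] for v in verdicts_per_sample]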
codecritic/evaluation/apps_exec.py  (view file @ 2a43e44e)

# copy from codeparrot/apps_metric/testing_util.py
# https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/testing_util.py
# Log: Replace pyext with importlib
import json
import sys
...
@@ -66,7 +66,7 @@ def run_test(sample, test=None, debug=False):
    """
    # Disable functionalities that can make destructive changes to the test.
    reliability_guard()

    if debug:
        print(f"start = {datetime.now().time()}")
...
@@ -84,7 +84,7 @@ def run_test(sample, test=None, debug=False):
    if debug:
        print(f"loaded input_output = {datetime.now().time()}")

    if test is None:
        return in_outs
    elif test is not None:
...
@@ -92,7 +92,7 @@ def run_test(sample, test=None, debug=False):
        sol = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
        if debug:
            print(f"loading test code = {datetime.now().time()}")

        if which_type == CODE_TYPE.call_based:
            sol += test
            if debug:
...
@@ -124,7 +124,7 @@ def run_test(sample, test=None, debug=False):
            else:
                new_test.append(x + "\n")
        tmp_test = new_test

        new_test = ""
        started = False
        for i in tmp_test:
...
@@ -133,7 +133,7 @@ def run_test(sample, test=None, debug=False):
                new_test += "def code():\n"
                new_test += i
                started = True
            elif started and ((i.startswith("from ")) or (i.startswith("import "))):
                new_test += "\t" + i
            else:
                new_test += i
...
@@ -157,7 +157,7 @@ def run_test(sample, test=None, debug=False):
        signal.alarm(0)
        if debug:
            print(f"get method = {datetime.now().time()}")

        try:
            method = getattr(tmp, method_name)  # get_attr second arg must be str
        except:
...
@@ -196,7 +196,7 @@ def run_test(sample, test=None, debug=False):
                    # ground truth sequences are not tuples
                    if isinstance(output, tuple):
                        output = list(output)

                    tmp_result = output == in_outs["outputs"][index]
                    if isinstance(in_outs["outputs"][index], list) and in_outs["outputs"][index]:
                        tmp_result = tmp_result or (output == in_outs["outputs"][index][0])
...
@@ -278,7 +278,7 @@ def run_test(sample, test=None, debug=False):
                        print(f"Failed check1 exception = {e}")
                        pass

                    if tmp_result == True:
                        results.append(tmp_result)
                        continue
...
@@ -312,10 +312,10 @@ def run_test(sample, test=None, debug=False):
                if debug:
                    nl = "\n"
                    if not isinstance(inputs, list):
                        print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
                    else:
                        print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")

                if tmp_result == True:
                    results.append(tmp_result)
                    continue
...
@@ -350,9 +350,9 @@ def run_test(sample, test=None, debug=False):
                # try by converting the stuff into split up list
                if isinstance(in_outs["outputs"][index], list):
                    for tmp_index, i in enumerate(in_outs["outputs"][index]):
-                        in_outs["outputs"][index][tmp_index] = set(i.split())
+                        in_outs["outputs"][index][tmp_index] = list(i.split())
                else:
-                    in_outs["outputs"][index] = set(in_outs["outputs"][index].split())
+                    in_outs["outputs"][index] = list(in_outs["outputs"][index].split())

                try:
                    tmp_result = (output == in_outs["outputs"][index])
...
@@ -363,7 +363,7 @@ def run_test(sample, test=None, debug=False):
                if tmp_result == True:
                    results.append(tmp_result)
                    continue

                # try by converting the output into a split up list too
                if isinstance(output, list):
...
@@ -371,14 +371,14 @@ def run_test(sample, test=None, debug=False):
                        output[tmp_index] = i.split()
                    output = list(filter(len, output))
                    for tmp_index, i in enumerate(output):
-                        output[tmp_index] = set(i)
+                        output[tmp_index] = list(i)
                else:
                    output = output.split()
                    output = list(filter(len, output))
-                    output = set(output)
+                    output = list(output)

                try:
-                    tmp_result = (set(frozenset(s) for s in output) == set(frozenset(s) for s in in_outs["outputs"][index]))
+                    tmp_result = (list(list(s) for s in output) == list(list(s) for s in in_outs["outputs"][index]))
                except Exception as e:
                    if debug:
                        print(f"Failed check5 exception = {e}")
...
@@ -386,30 +386,30 @@ def run_test(sample, test=None, debug=False):
                # if they are all numbers, round so that similar numbers are treated as identical
                try:
-                    tmp_result = tmp_result or (set(frozenset(round(float(t), 3) for t in s) for s in output) == \
-                        set(frozenset(round(float(t), 3) for t in s) for s in in_outs["outputs"][index]))
+                    tmp_result = tmp_result or (list(list(round(float(t), 3) for t in s) for s in output) == \
+                        list(list(round(float(t), 3) for t in s) for s in in_outs["outputs"][index]))
                except Exception as e:
                    if debug:
                        print(f"Failed check6 exception = {e}")

                if tmp_result == True and debug:
                    print("PASSED")

                results.append(tmp_result)

                if debug:
                    nl = "\n"
                    if not isinstance(inputs, list):
                        print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
                    else:
                        print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")

    return results


def custom_compare_(output, ground_truth):
    if isinstance(output, list):
        output_1 = "\n".join(output)
        if stripped_string_compare(output_1, ground_truth):
...
@@ -451,7 +451,7 @@ def call_method(method, inputs):
            pass
        finally:
            pass

    return _inner_call_method(method)
...
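
Most of the substantive changes in this file swap set-based output comparisons for list-based ones. The snippet below is only an illustration of the behavioural difference, not part of the commit: a set comparison ignores ordering and duplicates, so a wrong-order or duplicated output can still compare equal, whereas a list comparison preserves order and multiplicity.

expected = ["1 2", "1 2"]          # hypothetical two-line expected output
produced = ["2 1"]                 # wrong order, missing a line

# Set-style check (old behaviour): collapses order and duplicates.
print({frozenset(s.split()) for s in produced} ==
      {frozenset(s.split()) for s in expected})   # True, a false positive

# List-style check (new behaviour): order and multiplicity matter.
print([list(s.split()) for s in produced] ==
      [list(s.split()) for s in expected])        # False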
scripts/algolr.sh  (view file @ 2a43e44e)
...
@@ -3,55 +3,56 @@ set -xe
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst"
+data="${project}/data"

-trainset="${project}/data/train/${modelname}-apps-train.jsonl"
-testset="${project}/data/test/${modelname}-apps-test.jsonl"
+trainset="${data}/train/${modelname}-apps-train.jsonl"
+testset="${data}/test/${modelname}-apps-test.jsonl"

-train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
+train_selected_pairs="${data}/train/${modelname}-apps-train-selected_pairs.jsonl"

apps="/lustre/S/nanziyuan/datasets/apps/"

-sft="${project}/data/train/${modelname}-sft.jsonl"
+sft="${data}/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"

-testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
-evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-algolr-score.jsonl"
+testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
+evalresults="${data}/eval/qwen25_code_inst-apps-test-algolr-score.jsonl"

# export CUDA_VISIBLE_DEVICES=0,1,2,3

-# python -m codecritic.cli.algolr \
-#     --model ${model} \
-#     --dataset ${trainset} \
-#     --pairinfo ${train_selected_pairs} \
-#     --apps ${apps} \
-#     --output ${sft} \
-#     --level beginner \
-#     --tp 1
-
-# deepspeed --module \
-#     openrlhf.cli.train_sft \
-#     --max_len 4096 \
-#     --dataset ${sft} \
-#     --input_key question \
-#     --output_key response \
-#     --apply_chat_template \
-#     --train_batch_size 256 \
-#     --micro_train_batch_size 2 \
-#     --max_samples 500000 \
-#     --pretrain ${model} \
-#     --save_path ${ftmodel} \
-#     --save_steps -1 \
-#     --logging_steps 1 \
-#     --eval_steps -1 \
-#     --zero_stage 2 \
-#     --max_epochs 1 \
-#     --bf16 \
-#     --flash_attn \
-#     --learning_rate 5e-6 \
-#     --load_checkpoint \
-#     --gradient_checkpointing \
-#     --use_tensorboard "${ftmodel}_log"
+python -m codecritic.cli.algolr \
+    --model ${model} \
+    --dataset ${trainset} \
+    --pairinfo ${train_selected_pairs} \
+    --apps ${apps} \
+    --output ${sft} \
+    --level beginner \
+    --tp 1
+
+deepspeed --module \
+    openrlhf.cli.train_sft \
+    --max_len 4096 \
+    --dataset ${sft} \
+    --input_key question \
+    --output_key response \
+    --apply_chat_template \
+    --train_batch_size 256 \
+    --micro_train_batch_size 2 \
+    --max_samples 500000 \
+    --pretrain ${model} \
+    --save_path ${ftmodel} \
+    --save_steps -1 \
+    --logging_steps 1 \
+    --eval_steps -1 \
+    --zero_stage 2 \
+    --max_epochs 1 \
+    --bf16 \
+    --flash_attn \
+    --learning_rate 5e-6 \
+    --load_checkpoint \
+    --gradient_checkpointing \
+    --use_tensorboard "${ftmodel}_log"

python -m codecritic.cli.test_genrm \
...
scripts/gen_dataset.sh  (view file @ 2a43e44e)
...
@@ -12,7 +12,7 @@ train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pai
reward_ds="${project}/data/train/${modelname}-apps-train-reward_dataset.jsonl"

-export CUDA_VISIBLE_DEVICES=0,1,2,3
+# export CUDA_VISIBLE_DEVICES=0,1,2,3

## Sampling
## APPS
...
scripts/train_orm.sh  (view file @ 2a43e44e)
...
@@ -9,29 +9,29 @@ ftmodel="${project}/model/qwen25_coder_inst_7b-orm"
testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-orm-score.jsonl"

-# deepspeed --module \
-#     openrlhf.cli.train_rm \
-#     --save_path ${ftmodel} \
-#     --save_steps -1 \
-#     --logging_steps 1 \
-#     --eval_steps -1 \
-#     --train_batch_size 256 \
-#     --micro_train_batch_size 1 \
-#     --pretrain ${model} \
-#     --bf16 \
-#     --max_epochs 1 \
-#     --max_len 8192 \
-#     --zero_stage 3 \
-#     --learning_rate 9e-6 \
-#     --dataset ${dataset} \
-#     --apply_chat_template \
-#     --prompt_key messages \
-#     --chosen_key chosen \
-#     --rejected_key rejected \
-#     --flash_attn \
-#     --load_checkpoint \
-#     --gradient_checkpointing \
-#     --use_tensorboard "${ftmodel}_log"
+deepspeed --module \
+    openrlhf.cli.train_rm \
+    --save_path ${ftmodel} \
+    --save_steps -1 \
+    --logging_steps 1 \
+    --eval_steps -1 \
+    --train_batch_size 256 \
+    --micro_train_batch_size 1 \
+    --pretrain ${model} \
+    --bf16 \
+    --max_epochs 1 \
+    --max_len 8192 \
+    --zero_stage 3 \
+    --learning_rate 9e-6 \
+    --dataset ${dataset} \
+    --apply_chat_template \
+    --prompt_key messages \
+    --chosen_key chosen \
+    --rejected_key rejected \
+    --flash_attn \
+    --load_checkpoint \
+    --gradient_checkpointing \
+    --use_tensorboard "${ftmodel}_log"

start_server() {
...