Commit ef55d00d by nanziyuan

test r1

parent 0169b342
import argparse
import json
from collections import defaultdict
from functools import partial

import codecritic.evaluation.metric as metric
from codecritic.utils.json import load_jsonl


def confidence(item):
    # Signed confidence: positive when the verifier judged the code correct,
    # negative when it judged it incorrect.
    sign = 1 if item["prediction"] else -1
    return sign * item["confidence"]


def eval(scores):
    ks = list(range(1, 17))
    results = []
    # results.extend(metric.pass_at_k(scores, ks))
    # results.extend(metric.pass_at_k(scores, [50]))
    results.extend(metric.top_at_k(scores, ks, confidence))
    # if "negative_score" in scores[0]:
    #     results.extend(metric.top_at_k(scores, ks, metric.postive_and_negative))
    # for i in range(4):
    #     threshold = 0.5 + i * 0.1
    #     score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
    #     results.extend(metric.top_at_k(scores, ks, score_func))
    return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--score", type=str, help="path/to/score")
    args = parser.parse_args()

    scores = load_jsonl(args.score)

    # Evaluate each dataset split separately.
    groups = defaultdict(list)
    for item in scores:
        groups[item["dataset"]].append(item)

    for dataset, lst in groups.items():
        results = eval(lst)
        for r in results:
            r["dataset"] = dataset
            r["strategy"] = "r1_qwen_7b"
            del r["score_func"]
            print(json.dumps(r))
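The ranking itself is delegated to codecritic.evaluation.metric. As orientation, here is a minimal sketch of what top_at_k is assumed to compute with the signed confidence score above: take k candidate solutions per task, let the score function pick the highest-scored one, and count the task as solved if that pick passes its tests. The name top_at_k_sketch is hypothetical and the real implementation (for example, averaging over random k-subsets) may differ; the task_id, pass, and score_func fields are the ones used elsewhere in this commit.

# Illustrative sketch only, not the codecritic implementation.
from collections import defaultdict

def top_at_k_sketch(scores, ks, score_func):
    # Group candidate solutions by task.
    by_task = defaultdict(list)
    for item in scores:
        by_task[item["task_id"]].append(item)

    results = []
    for k in ks:
        solved = 0
        for items in by_task.values():
            # Take k candidates and let the score function pick the best one.
            best = max(items[:k], key=score_func)
            solved += bool(best["pass"])
        results.append({
            "score_func": score_func.__name__,
            "k": k,
            "top_at_k": solved / len(by_task),
        })
    return results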
import argparse
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
from functools import partial

from openai import OpenAI
from tqdm.contrib.concurrent import thread_map

from codecritic.utils.json import load_jsonl, save_jsonl

# OpenAI-compatible endpoint served locally (see the serving config in this commit).
client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="token-abc123",
)


def chat(messages, model):
    completion = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=8192,
        temperature=0,
    )
    response = completion.choices[0].message.content
    return response


def load_dataset(test_path):
    raw_dataset = load_jsonl(test_path)

    task_dict = defaultdict(list)
    for item in raw_dataset:
        task_dict[item["task_id"]].append(item)

    # Tasks where every candidate solution fails are kept aside: the verifier is
    # not queried for them, and they are written out with a neutral score.
    unsolvable, dataset = [], []
    for _, items in task_dict.items():
        if all([not x["pass"] for x in items]):
            for item in items:
                item["confidence"] = 0
                item["prediction"] = None
            unsolvable.extend(items)
        else:
            dataset.extend(items)
    return dataset, unsolvable


prompt_template = """
Please verify if the following code correctly solves this question:
Question: {question}
Code:
{code}
Please provide:
1. Your judgment (True if the code is correct, False if it's not)
2. Your confidence (a float number between 0.00 and 1.00, where 0.00 means you have no idea about the solution and are purely guessing, and 1.00 means you are absolutely certain your solution is correct)
Format your response exactly like this:
Judgment: [True/False]
Confidence: [0.00-1.00]
""".strip()


def preprocess_prompt(item):
    question = item["messages"][0]["content"]
    code = item["code"]
    prompt = prompt_template.format(question=question, code=code)
    return [{"role": "user", "content": prompt}]


def postprocess_response(response):
    try:
        # Skip the thinking process if present
        if '<think>' in response:
            response = response.split('</think>')[-1].strip()

        # Extract judgment and confidence using string parsing
        lines = response.strip().split('\n')
        judgment = None
        confidence = None

        for line in lines:
            line = line.strip('*')
            if line.lower().startswith('judgment:'):
                judgment_str = line.split(':', 1)[1].strip().lower()
                if judgment_str in ['true', 'false']:
                    judgment = judgment_str == 'true'
            elif line.lower().startswith('confidence:'):
                confidence_str = line.split(':', 1)[1].strip()
                try:
                    confidence = float(confidence_str)
                    # Ensure confidence is between 0 and 1
                    confidence = max(0.0, min(1.0, confidence))
                except ValueError:
                    confidence = None

        return judgment, confidence
    except Exception as e:
        # Return default values in case of any error
        print(e)
        return None, None
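# Worked example (illustrative reply, not captured model output): for a response like
#   "<think>check the loop bounds ...</think>\nJudgment: True\nConfidence: 0.85"
# postprocess_response drops everything up to </think> and returns (True, 0.85).
# A field that cannot be parsed comes back as None, and out-of-range confidence
# values are clamped into [0.0, 1.0].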
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--testset", type=str, help="path/to/testset")
    parser.add_argument("--output", type=str, help="path/to/score")
    args = parser.parse_args()

    chat_fun = partial(chat, model=args.model)

    dataset, unsolvable = load_dataset(args.testset)
    # dataset = dataset[:4]
    prompts = list(map(preprocess_prompt, dataset))

    # with ThreadPoolExecutor(max_workers=4) as executor:
    #     responses = executor.map(chat_fun, prompts)
    responses = thread_map(chat_fun, prompts, max_workers=4)

    for item, response in zip(dataset, responses):
        judgement, confidence = postprocess_response(response)
        item["prediction"] = judgement
        item["confidence"] = confidence
        item["response"] = response

    save_jsonl(dataset + unsolvable, args.output)
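Each record written by save_jsonl is what the evaluation script at the top of this commit consumes: it groups items by their "dataset" field and ranks them with the signed confidence score. A hypothetical output line (all field values illustrative) would look like:

# {"task_id": "apps/interview/0001", "dataset": "apps", "pass": false,
#  "prediction": true, "confidence": 0.85,
#  "response": "Judgment: True\nConfidence: 0.85", ...}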
...@@ -15,7 +15,6 @@ apps="/lustre/S/nanziyuan/datasets/apps/" ...@@ -15,7 +15,6 @@ apps="/lustre/S/nanziyuan/datasets/apps/"
sft="${data}/train/qwen25_coder_inst-sft-balanced.jsonl" sft="${data}/train/qwen25_coder_inst-sft-balanced.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr_balance_epoch3_bs32" ftmodel="${project}/model/qwen25_coder_inst_7b-algolr_balance_epoch3_bs32"
testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="${data}/eval/qwen25_code_inst-apps-test-algolr-balance_epoch3_bs32.jsonl" evalresults="${data}/eval/qwen25_code_inst-apps-test-algolr-balance_epoch3_bs32.jsonl"
# export CUDA_VISIBLE_DEVICES=0,1,2,3 # export CUDA_VISIBLE_DEVICES=0,1,2,3
......
llm_kit:
  model: /share/collab/codemodel/models/DeepSeek-R1-Distill-Qwen-7B/
  host: 0.0.0.0
  router_port: 8000
  tensor_parallel_size: 1
  pipeline_parallel_size: 1
  data_parallel_size: 4
  router_timeout: 1200
  random_seeds:
    - 1111
    - 2222
    - 3333
    - 4444
    - 5555
    - 6666
    - 7777
    - 8888
  vllm:
    api_key: token-abc123
    dtype: auto
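This serving config is read by llmkit_data.cli.serve (launched, commented out, in the test_r1 script later in this commit) and exposes the OpenAI-compatible endpoint that the test_r1 client above points at. A minimal sketch of deriving the client settings from it, assuming the file is saved as config.yaml and the keys sit under llm_kit as restored above (both are assumptions, not part of the commit):

# Sketch only: file name and key nesting are assumptions.
import yaml
from openai import OpenAI

with open("config.yaml") as f:
    cfg = yaml.safe_load(f)["llm_kit"]

client = OpenAI(
    base_url=f"http://localhost:{cfg['router_port']}/v1",  # router_port: 8000
    api_key=cfg["vllm"]["api_key"],                        # token-abc123
)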
@@ -6,11 +6,46 @@ data="${project}/data"
 modelname="qwen25_coder_inst"
 trainset="${data}/train/${modelname}-apps-train.jsonl"
+testset="${data}/test/${modelname}-apps-test.jsonl"
 train_selected_pairs="${data}/train/${modelname}-apps-train-selected_pairs.jsonl"
 distill="${data}/train/${modelname}-apps-distillation-deepseekv3.jsonl"
+ftmodel="${project}/model/${modelname}-apps-distillation_bs32_epoch10"
+evalresults="${data}/eval/${modelname}-apps-test-distillation-bs32-epoch10.jsonl"

-python -m codecritic.cli.distill \
-    --dataset ${trainset} \
-    --pairinfo ${train_selected_pairs} \
-    --output ${distill}
+# python -m codecritic.cli.distill \
+#     --dataset ${trainset} \
+#     --pairinfo ${train_selected_pairs} \
+#     --output ${distill}

+deepspeed --module \
+    openrlhf.cli.train_sft \
+    --max_len 4096 \
+    --dataset ${distill} \
+    --input_key question \
+    --output_key response \
+    --apply_chat_template \
+    --train_batch_size 32 \
+    --micro_train_batch_size 1 \
+    --max_samples 500000 \
+    --pretrain ${model} \
+    --save_path ${ftmodel} \
+    --save_steps -1 \
+    --logging_steps 1 \
+    --eval_steps -1 \
+    --zero_stage 2 \
+    --max_epochs 10 \
+    --bf16 \
+    --flash_attn \
+    --learning_rate 5e-6 \
+    --gradient_checkpointing \
+    --use_tensorboard "${ftmodel}_log"

+python -m codecritic.cli.test_genrm \
+    --model ${ftmodel} \
+    --testset ${testset} \
+    --output ${evalresults} \
+    --reasoning \
+    --tp 1
set -xe
timestamp="20250115_002719"
# timestamp=$(date +"%Y%m%d_%H%M%S")
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-32B-Instruct"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst_32b_${timestamp}"
tp=4
apps="/lustre/S/nanziyuan/datasets/apps/"
data="${project}/data"
trainset="${data}/train/${modelname}-apps-train.jsonl"
testset="${data}/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
reward_ds="${project}/data/train/${modelname}-apps-train-reward_dataset.jsonl"
# python -m codecritic.cli.gen_dataset \
# --model ${model} \
# --apps ${apps} \
# --train ${trainset} \
# --test ${testset} \
# --tp ${tp}
# python -m codecritic.cli.select_preference_pairs \
# --dataset ${trainset} \
# --output ${train_selected_pairs}
# python -m codecritic.cli.reformat \
# --dataset ${trainset} \
# --pairs ${train_selected_pairs} \
# --format reward \
# --output ${reward_ds}
sft="${data}/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/${modelname}-algolr"
algolrscore="${data}/eval/${modelname}-apps-test-base-score.jsonl"
# python -m codecritic.cli.algolr \
# --model ${model} \
# --dataset ${trainset} \
# --pairinfo ${train_selected_pairs} \
# --apps ${apps} \
# --output ${sft} \
# --level beginner \
# --tp ${tp}
# deepspeed --module \
# openrlhf.cli.train_sft \
# --max_len 5120 \
# --dataset ${sft} \
# --input_key question \
# --output_key response \
# --apply_chat_template \
# --train_batch_size 32 \
# --micro_train_batch_size 1 \
# --max_samples 500000 \
# --pretrain ${model} \
# --save_path ${ftmodel} \
# --save_steps -1 \
# --logging_steps 1 \
# --eval_steps -1 \
# --zero_stage 3 \
# --max_epochs 1 \
# --bf16 \
# --flash_attn \
# --learning_rate 5e-6 \
# --load_checkpoint \
# --gradient_checkpointing \
# --adam_offload \
# --use_tensorboard "${ftmodel}_log"
python -m codecritic.cli.test_genrm \
    --model ${model} \
    --testset ${testset} \
    --output ${algolrscore} \
    --reasoning \
    --tp ${tp}
# # ORM
# ormmodel="${project}/model/${modelname}-orm"
# ormscore="${data}/eval/${modelname}-apps-test-orm-score.jsonl"
# deepspeed --module \
# openrlhf.cli.train_rm \
# --save_path ${ormmodel} \
# --save_steps -1 \
# --logging_steps 1 \
# --eval_steps -1 \
# --train_batch_size 128 \
# --micro_train_batch_size 1 \
# --pretrain ${model} \
# --bf16 \
# --max_epochs 1 \
# --max_len 8192 \
# --zero_stage 3 \
# --beta 0.1 \
# --learning_rate 9e-6 \
# --dataset ${reward_ds} \
# --apply_chat_template \
# --prompt_key messages \
# --chosen_key chosen \
# --rejected_key rejected \
# --flash_attn \
# --gradient_checkpointing \
# --adam_offload \
# --use_tensorboard "${ormmodel}_log"
# start_server() {
# echo "Starting server..."
# CUDA_VISIBLE_DEVICES=0,1,2,3 \
# python -m openrlhf.cli.serve_rm \
# --reward_pretrain ${ormmodel} \
# --normalize_reward \
# --port 5000 \
# --bf16 \
# --max_len 8192 &
# SERVER_PID=$!
# echo "Server started with PID: $SERVER_PID"
# }
# # Function to start the client
# start_client() {
# echo "Starting client..."
# python -m codecritic.cli.test_orm \
# --model ${ftmodel} \
# --testset ${testset} \
# --output ${ormscore}
# CLIENT_EXIT_CODE=$?
# echo "Client finished with exit code: $CLIENT_EXIT_CODE"
# }
# # Function to stop the server
# stop_server() {
# echo "Stopping server..."
# kill -SIGINT $SERVER_PID
# wait $SERVER_PID 2>/dev/null
# echo "Server stopped."
# }
# start_server
# # Give the server some time to initialize (optional)
# sleep 600
# start_client
# stop_server
# echo "Execution complete."
@@ -15,22 +15,22 @@ testset="${data}/test/${modelname}-apps-test.jsonl"
 train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
 reward_ds="${project}/data/train/${modelname}-apps-train-reward_dataset.jsonl"

-python -m codecritic.cli.gen_dataset \
-    --model ${model} \
-    --apps ${apps} \
-    --train ${trainset} \
-    --test ${testset} \
-    --tp ${tp}
+# python -m codecritic.cli.gen_dataset \
+#     --model ${model} \
+#     --apps ${apps} \
+#     --train ${trainset} \
+#     --test ${testset} \
+#     --tp ${tp}

-python -m codecritic.cli.select_preference_pairs \
-    --dataset ${trainset} \
-    --output ${train_selected_pairs}
+# python -m codecritic.cli.select_preference_pairs \
+#     --dataset ${trainset} \
+#     --output ${train_selected_pairs}

-python -m codecritic.cli.reformat \
-    --dataset ${trainset} \
-    --pairs ${train_selected_pairs} \
-    --format reward \
-    --output ${reward_ds}
+# python -m codecritic.cli.reformat \
+#     --dataset ${trainset} \
+#     --pairs ${train_selected_pairs} \
+#     --format reward \
+#     --output ${reward_ds}

 # sft="${data}/train/${modelname}-sft.jsonl"
@@ -82,8 +82,8 @@ python -m codecritic.cli.reformat \
 #     --tp 1

 # # ORM
-# ormmodel="${project}/model/${modelname}-orm"
-# ormscore="${data}/eval/${$modelname}-apps-test-orm-score.jsonl"
+ormmodel="${project}/model/${modelname}-orm"
+ormscore="${data}/eval/${modelname}-apps-test-orm-score.jsonl"
 # deepspeed --module \
 #     openrlhf.cli.train_rm \
...@@ -98,7 +98,6 @@ python -m codecritic.cli.reformat \ ...@@ -98,7 +98,6 @@ python -m codecritic.cli.reformat \
# --max_epochs 1 \ # --max_epochs 1 \
# --max_len 8192 \ # --max_len 8192 \
# --zero_stage 3 \ # --zero_stage 3 \
# --beta 0.1 \
# --learning_rate 9e-6 \ # --learning_rate 9e-6 \
# --dataset ${reward_ds} \ # --dataset ${reward_ds} \
# --apply_chat_template \ # --apply_chat_template \
...@@ -111,41 +110,41 @@ python -m codecritic.cli.reformat \ ...@@ -111,41 +110,41 @@ python -m codecritic.cli.reformat \
# --use_tensorboard "${ormmodel}_log" # --use_tensorboard "${ormmodel}_log"
# start_server() { start_server() {
# echo "Starting server..." echo "Starting server..."
# CUDA_VISIBLE_DEVICES=0,1,2,3 \ CUDA_VISIBLE_DEVICES=0 \
# python -m openrlhf.cli.serve_rm \ python -m openrlhf.cli.serve_rm \
# --reward_pretrain ${ormmodel} \ --reward_pretrain ${ormmodel} \
# --normalize_reward \ --normalize_reward \
# --port 5000 \ --port 5000 \
# --bf16 \ --bf16 \
# --max_len 8192 & --max_len 8192 &
# SERVER_PID=$! SERVER_PID=$!
# echo "Server started with PID: $SERVER_PID" echo "Server started with PID: $SERVER_PID"
# } }
# # Function to start the client # Function to start the client
# start_client() { start_client() {
# echo "Starting client..." echo "Starting client..."
# python -m codecritic.cli.test_orm \ python -m codecritic.cli.test_orm \
# --model ${ftmodel} \ --model ${ormmodel} \
# --testset ${testset} \ --testset ${testset} \
# --output ${ormscore} --output ${ormscore}
# CLIENT_EXIT_CODE=$? CLIENT_EXIT_CODE=$?
# echo "Client finished with exit code: $CLIENT_EXIT_CODE" echo "Client finished with exit code: $CLIENT_EXIT_CODE"
# } }
# # Function to stop the server # Function to stop the server
# stop_server() { stop_server() {
# echo "Stopping server..." echo "Stopping server..."
# kill -SIGINT $SERVER_PID kill -SIGINT $SERVER_PID
# wait $SERVER_PID 2>/dev/null wait $SERVER_PID 2>/dev/null
# echo "Server stopped." echo "Server stopped."
# } }
# start_server start_server
# # Give the server some time to initialize (optional) # Give the server some time to initialize (optional)
# sleep 600 sleep 600
# start_client start_client
# stop_server stop_server
# echo "Execution complete." echo "Execution complete."
set -xe
model="/share/collab/codemodel/models/DeepSeek-R1-Distill-Qwen-7B/"
data="/nfs_global/S/nanziyuan/projects/ccc/data"
testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="${data}/eval/qwen25_code_inst-apps-test-r1_7b_test.jsonl"
# python -m llmkit_data.cli.serve --config /nfs_global/S/nanziyuan/projects/ccc/src/scripts/config.yaml &
# vllm serve ${model} --max-model-len 12288
# sleep 300s
python -m codecritic.cli.test_r1 \
    --model ${model} \
    --testset ${testset} \
    --output ${evalresults}
+ model=/share/collab/codemodel/models/DeepSeek-R1-Distill-Qwen-7B/
+ data=/nfs_global/S/nanziyuan/projects/ccc/data
+ testset=/nfs_global/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl
+ evalresults=/nfs_global/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-r1_7b.jsonl
+ echo hello
+ python -m codecritic.cli.test_r1 --model /share/collab/codemodel/models/DeepSeek-R1-Distill-Qwen-7B/ --testset /nfs_global/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl --output /nfs_global/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-r1_7b.jsonl