Commit 0169b342 by nanziyuan

Too many things: add a DeepSeek-based CoT distillation CLI and prompt templates, save both unfiltered and filtered SFT outputs, add binary_metrics and switch GenRM evaluation to positive_only scores, and update the SFT/ORM training scripts for the balanced 7B run and the 32B pipeline.

parent 2a43e44e
......@@ -177,7 +177,7 @@ if __name__ == "__main__":
print("Size of sft dataset: {}".format(len(sft)))
pprint.pp(sft[0])
save_jsonl(sft, args.output)
save_jsonl(sft, args.output + '.unfiltered')
# Step5 keep 1 rationale for 1 solution
task_solution_map = defaultdict(lambda: defaultdict(list))
......@@ -198,4 +198,4 @@ if __name__ == "__main__":
else:
processed_dataset.append(reasoning_list[0])
save_jsonl(processed_dataset, args.output.split('.')[0] + "-filtered.jsonl")
save_jsonl(processed_dataset, args.output)
import argparse
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor
import os
import pprint
from pathlib import Path
from openai import OpenAI
from codecritic.utils.json import load_jsonl, save_jsonl
import codecritic.dataset.distill_prompt as promptlib
# API key is read from the environment (the variable name here is a placeholder) instead of a hard-coded secret.
client = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com")
def generate_completion(prompt):
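    """Call the DeepSeek chat API for a single prompt and store the reply under prompt["cot"]."""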
response = client.chat.completions.create(
model="deepseek-chat",
messages=prompt["gen_prompt"],
max_tokens=1024,
temperature=0.7,
stream=False
)
response_content = response.choices[0].message.content
prompt["cot"] = response_content
return prompt
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--dataset", type=str, help="path/to/sample")
parser.add_argument("--pairinfo", type=str, help="path/to/pairinfo")
parser.add_argument("--output", type=str, help="path/to/score")
args = parser.parse_args()
dataset = load_jsonl(args.dataset)
pairinfo = load_jsonl(args.pairinfo)
ds = defaultdict(dict)
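# Index samples by task_id and then solution_id so each preference pair can be looked up directly.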
for item in dataset:
ds[item["task_id"]][item["solution_id"]] = item
verify_prompts = []
for pair in pairinfo:
task_id, chosen_id, rejected_id = pair["task_id"], pair["chosen"], pair["rejected"]
chosen, rejected = ds[task_id][chosen_id], ds[task_id][rejected_id]
prompts = promptlib.mk_distillation_messages(chosen, rejected)
verify_prompts.extend(prompts)
pprint.pp(verify_prompts[:2])
# verify_prompts = verify_prompts[:8]
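# Resume support: if a partial .raw_response file already exists, skip the prompts that were generated before.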
raw_response_path = Path(args.output + ".raw_response")
if raw_response_path.exists():
generated_responses = load_jsonl(raw_response_path)
verify_prompts = verify_prompts[len(generated_responses):]
else:
generated_responses = []
print("generated:", len(generated_responses), "rest", len(verify_prompts))
with ThreadPoolExecutor(max_workers=8) as executor:
raw_responses = list(executor.map(generate_completion, verify_prompts))
generated_responses.extend(raw_responses)
save_jsonl(generated_responses, args.output + ".raw_response")
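# Keep only rationales whose parsed verdict agrees with the known pass/fail label of the solution.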
outputs = []
for res in generated_responses:
is_valid, clean_response, verification = promptlib.postprocess_result(res["cot"])
print(is_valid, verification)
if is_valid and (verification == res["pass"]):
row = {
"task_id": res["task_id"],
"solution_id": res["solution_id"],
"question": res["train_prompt"],
"response": [{"role": "user", "content": clean_response}]
}
outputs.append(row)
save_jsonl(outputs, args.output)
......@@ -10,9 +10,9 @@ def eval(scores):
ks = list(range(1, 17))
results = []
# results.extend(metric.pass_at_k(scores, ks))
results.extend(metric.pass_at_k(scores, ks))
# results.extend(metric.pass_at_k(scores, [50]))
# results.extend(metric.top_at_k(scores, ks, metric.positive_only))
results.extend(metric.top_at_k(scores, ks, metric.positive_only))
if "negative_score" in scores[0]:
results.extend(metric.top_at_k(scores, ks, metric.postive_and_negative))
......
......@@ -112,6 +112,7 @@ if __name__ == "__main__":
trainset = load_jsonl(train_raw_path)
testset = load_jsonl(test_raw_path)
print("Start evaluation")
trainset = evaluate(trainset, apps)
testset = evaluate(testset, apps)
......
......@@ -10,7 +10,7 @@ from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.evaluation.metric import postive_and_negative, binary_metrics
from codecritic.evaluation.metric import postive_and_negative, binary_metrics, positive_only
if __name__ == "__main__":
......@@ -32,7 +32,7 @@ if __name__ == "__main__":
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(args.model)
dataset = load_jsonl(args.trainset)[:1000]
dataset = load_jsonl(args.trainset)
for item in dataset:
item["messages"] = item["question"]
......@@ -69,7 +69,7 @@ if __name__ == "__main__":
)
dataset = model_map(worker, dataset, args.tp)
scores = [postive_and_negative(item) for item in dataset]
scores = [positive_only(item) for item in dataset]
labels = [item["pass"] for item in dataset]
pprint.pp(binary_metrics(labels, scores))
......
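# Prompt templates for CoT distillation. The generation prompt shows the grader an expected answer;
# the training prompt omits it, so the fine-tuned model must judge the solution on its own.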
COT_GENERATION_PROMPT = """
You are a programming teacher. Grade the **Solution**, verifying its correctness step by step. Use the **Expected Answer** to identify any erroneous steps or logic in the **Solution**. At the end of the verification, when you give your final grade, write it in the form:
**"Verification: Is the code correct (Yes/No)? X"**, where **X** is either **Yes** or **No**.
**Question:**
{problem}
**Solution:**
{solution}
**Expected Answer:**
{ground_truth}
"""
TRAIN_PROMPT = """
You are a programming teacher. Grade the **Solution**, verifying its correctness step by step.
**Question:**
{problem}
**Solution:**
{solution}
"""
def mk_distillation_messages(chosen, rejected):
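    # The chosen solution is graded without a reference ("None"); the rejected one is graded against the chosen code as the expected answer.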
question = chosen["messages"][0]["content"]
chosen_answer = chosen["code"]
rejected_answer = rejected["code"]
chosen_prompt = COT_GENERATION_PROMPT.format(
problem=question,
solution=chosen_answer,
ground_truth="None"
)
rejected_prompt = COT_GENERATION_PROMPT.format(
problem=question,
solution=rejected_answer,
ground_truth=chosen_answer
)
chosen["gen_prompt"] = [{"role": "user", "content": chosen_prompt}]
rejected["gen_prompt"] = [{"role": "user", "content": rejected_prompt}]
chosen_train_prompt = TRAIN_PROMPT.format(
problem=question,
solution=chosen_answer
)
rejected_train_prompt = TRAIN_PROMPT.format(
problem=question,
solution=rejected_answer
)
chosen["train_prompt"] = [{"role": "user", "content": chosen_train_prompt}]
rejected["train_prompt"] = [{"role": "user", "content": rejected_train_prompt}]
return [chosen, rejected]
def postprocess_result(response):
"""
-> (valid_response: bool, cleaned_response: str, verification: bool)
"""
verification_prefix = "Verification: Is the code correct"
lines = response.splitlines()
# Search for the verification line
verification_line = None
idx = len(lines) - 1
for idx in range(len(lines) - 1, -1, -1):
strip_line = lines[idx].strip()
if verification_prefix in strip_line:
verification_line = strip_line
break
if verification_line:
cleaned_response = "\n".join(lines[:idx])
parts = verification_line.split('?')
if len(parts) > 1:
answer = parts[1].strip().lower() # Get the part after '?'
if "yes" in answer:
return True, cleaned_response, True
elif "no" in answer:
return True, cleaned_response, False
else:
# If the answer is neither "Yes" nor "No", assume incorrect
return False, cleaned_response, False
else:
# If there's no '?', assume incorrect
return False, cleaned_response, False
else:
# If no verification line is found, assume the result is incorrect
return False, response, False
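# Illustrative example (not part of the pipeline), assuming a typical model reply:
#   response = "Step 1 checks out.\n**Verification: Is the code correct (Yes/No)? Yes**"
#   postprocess_result(response) -> (True, "Step 1 checks out.", True)
# The verification line itself is stripped from the cleaned response.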
......@@ -82,6 +82,7 @@ def evaluate_code_samples(code_samples, apps):
args.append((apps[split][int(idx)], sample))
cpu_num = multiprocessing.cpu_count() // 2
print(f"Using {cpu_num} cpu")
chunksize = max(len(code_samples) // (cpu_num * 10), 1)
results = process_map(
test_generation, args, max_workers=cpu_num, chunksize=chunksize
......
......@@ -99,3 +99,40 @@ def auroc(samples, score_func):
fpr, tpr, thresholds = metrics.roc_curve(y, pred)
roc_auc = metrics.auc(fpr, tpr)
return roc_auc, fpr, tpr
def binary_metrics(labels, scores, threshold=0.5):
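    """Confusion-matrix counts plus accuracy, precision, recall and F1 for boolean labels against scalar scores."""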
# Initialize counters
tn = tp = fn = fp = 0
# Iterate over labels and scores
for label, score in zip(labels, scores):
predicted_label = score >= threshold
if label:
if predicted_label:
tp += 1
else:
fn += 1
else:
if predicted_label:
fp += 1
else:
tn += 1
# Calculate metrics
accuracy = (tp + tn) / (tp + tn + fp + fn) if (tp + tn + fp + fn) != 0 else 0
precision = tp / (tp + fp) if (tp + fp) != 0 else 0
recall = tp / (tp + fn) if (tp + fn) != 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0
# Return results as a dictionary
return {
'true_negatives': tn,
'true_positives': tp,
'false_negatives': fn,
'false_positives': fp,
'accuracy': accuracy,
'precision': precision,
'recall': recall,
'f1_score': f1_score
}
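# Illustrative example (values are easy to verify by hand):
#   binary_metrics([True, False, True], [0.9, 0.7, 0.2])
#   -> tp=1, fp=1, fn=1, tn=0; accuracy=1/3, precision=0.5, recall=0.5, f1=0.5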
......@@ -11,46 +11,46 @@ testset="${data}/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${data}/train/${modelname}-apps-train-selected_pairs.jsonl"
apps="/lustre/S/nanziyuan/datasets/apps/"
sft="${data}/train/${modelname}-sft.jsonl"
# sft="${data}/train/${modelname}-sft.jsonl"
sft="${data}/train/qwen25_coder_inst-sft-balanced.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr_balance_epoch3_bs32"
testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="${data}/eval/qwen25_code_inst-apps-test-algolr-score.jsonl"
evalresults="${data}/eval/qwen25_code_inst-apps-test-algolr-balance_epoch3_bs32.jsonl"
# export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m codecritic.cli.algolr \
--model ${model} \
--dataset ${trainset} \
--pairinfo ${train_selected_pairs} \
--apps ${apps} \
--output ${sft} \
--level beginner \
--tp 1
# python -m codecritic.cli.algolr \
# --model ${model} \
# --dataset ${trainset} \
# --pairinfo ${train_selected_pairs} \
# --apps ${apps} \
# --output ${sft} \
# --level beginner \
# --tp 1
deepspeed --module \
openrlhf.cli.train_sft \
--max_len 4096 \
--max_len 5632 \
--dataset ${sft} \
--input_key question \
--output_key response \
--apply_chat_template \
--train_batch_size 256 \
--micro_train_batch_size 2 \
--train_batch_size 32 \
--micro_train_batch_size 1 \
--max_samples 500000 \
--pretrain ${model} \
--save_path ${ftmodel} \
--save_steps -1 \
--logging_steps 1 \
--eval_steps -1 \
--zero_stage 2 \
--max_epochs 1 \
--zero_stage 3 \
--max_epochs 3 \
--bf16 \
--flash_attn \
--learning_rate 5e-6 \
--load_checkpoint \
--gradient_checkpointing \
--use_tensorboard "${ftmodel}_log"
......
set -xe
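# Distill chain-of-thought rationales from DeepSeek for the selected APPS preference pairs.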
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
data="${project}/data"
modelname="qwen25_coder_inst"
trainset="${data}/train/${modelname}-apps-train.jsonl"
train_selected_pairs="${data}/train/${modelname}-apps-train-selected_pairs.jsonl"
distill="${data}/train/${modelname}-apps-distillation-deepseekv3.jsonl"
python -m codecritic.cli.distill \
--dataset ${trainset} \
--pairinfo ${train_selected_pairs} \
--output ${distill}
set -xe
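# Qwen2.5-Coder-32B pipeline: generate APPS train/test samples, select preference pairs, build the reward dataset,
# and produce AlgoLR SFT data; the SFT/ORM training and serving steps below are commented out.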
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-32B-Instruct"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst_32b"
tp=4
apps="/lustre/S/nanziyuan/datasets/apps/"
data="${project}/data"
trainset="${data}/train/${modelname}-apps-train.jsonl"
testset="${data}/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
reward_ds="${project}/data/train/${modelname}-apps-train-reward_dataset.jsonl"
python -m codecritic.cli.gen_dataset \
--model ${model} \
--apps ${apps} \
--train ${trainset} \
--test ${testset} \
--tp ${tp}
python -m codecritic.cli.select_preference_pairs \
--dataset ${trainset} \
--output ${train_selected_pairs}
python -m codecritic.cli.reformat \
--dataset ${trainset} \
--pairs ${train_selected_pairs} \
--format reward \
--output ${reward_ds}
sft="${data}/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/${modelname}-algolr"
algolrscore="${data}/eval/${modelname}-apps-test-algolr-score.jsonl"
python -m codecritic.cli.algolr \
--model ${model} \
--dataset ${trainset} \
--pairinfo ${train_selected_pairs} \
--apps ${apps} \
--output ${sft} \
--level beginner \
--tp ${tp}
# deepspeed --module \
# openrlhf.cli.train_sft \
# --max_len 4096 \
# --dataset ${sft} \
# --input_key question \
# --output_key response \
# --apply_chat_template \
# --train_batch_size 128 \
# --micro_train_batch_size 1 \
# --max_samples 500000 \
# --pretrain ${model} \
# --save_path ${ftmodel} \
# --save_steps -1 \
# --logging_steps 1 \
# --eval_steps -1 \
# --zero_stage 3 \
# --beta 0.1 \
# --max_epochs 1 \
# --bf16 \
# --flash_attn \
# --learning_rate 5e-6 \
# --load_checkpoint \
# --gradient_checkpointing \
# --adam_offload \
# --use_tensorboard "${ftmodel}_log"
# python -m codecritic.cli.test_genrm \
# --model ${ftmodel} \
# --testset ${testset} \
# --output ${algolrscore} \
# --reasoning \
# --tp 1
# # ORM
# ormmodel="${project}/model/${modelname}-orm"
# ormscore="${data}/eval/${modelname}-apps-test-orm-score.jsonl"
# deepspeed --module \
# openrlhf.cli.train_rm \
# --save_path ${ormmodel} \
# --save_steps -1 \
# --logging_steps 1 \
# --eval_steps -1 \
# --train_batch_size 128 \
# --micro_train_batch_size 1 \
# --pretrain ${model} \
# --bf16 \
# --max_epochs 1 \
# --max_len 8192 \
# --zero_stage 3 \
# --beta 0.1 \
# --learning_rate 9e-6 \
# --dataset ${reward_ds} \
# --apply_chat_template \
# --prompt_key messages \
# --chosen_key chosen \
# --rejected_key rejected \
# --flash_attn \
# --gradient_checkpointing \
# --adam_offload \
# --use_tensorboard "${ormmodel}_log"
# start_server() {
# echo "Starting server..."
# CUDA_VISIBLE_DEVICES=0,1,2,3 \
# python -m openrlhf.cli.serve_rm \
# --reward_pretrain ${ormmodel} \
# --normalize_reward \
# --port 5000 \
# --bf16 \
# --max_len 8192 &
# SERVER_PID=$!
# echo "Server started with PID: $SERVER_PID"
# }
# # Function to start the client
# start_client() {
# echo "Starting client..."
# python -m codecritic.cli.test_orm \
# --model ${ftmodel} \
# --testset ${testset} \
# --output ${ormscore}
# CLIENT_EXIT_CODE=$?
# echo "Client finished with exit code: $CLIENT_EXIT_CODE"
# }
# # Function to stop the server
# stop_server() {
# echo "Stopping server..."
# kill -SIGINT $SERVER_PID
# wait $SERVER_PID 2>/dev/null
# echo "Server stopped."
# }
# start_server
# # Give the server some time to initialize (optional)
# sleep 600
# start_client
# stop_server
# echo "Execution complete."
set -xe
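# Timestamped variant of the 32B pipeline: data generation steps are active; the AlgoLR/SFT/ORM steps remain commented out.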
timestamp="20250115_002719"
# timestamp=$(date +"%Y%m%d_%H%M%S")
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-32B-Instruct"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst_32b_${timestamp}"
tp=4
apps="/lustre/S/nanziyuan/datasets/apps/"
data="${project}/data"
trainset="${data}/train/${modelname}-apps-train.jsonl"
testset="${data}/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
reward_ds="${project}/data/train/${modelname}-apps-train-reward_dataset.jsonl"
python -m codecritic.cli.gen_dataset \
--model ${model} \
--apps ${apps} \
--train ${trainset} \
--test ${testset} \
--tp ${tp}
python -m codecritic.cli.select_preference_pairs \
--dataset ${trainset} \
--output ${train_selected_pairs}
python -m codecritic.cli.reformat \
--dataset ${trainset} \
--pairs ${train_selected_pairs} \
--format reward \
--output ${reward_ds}
# sft="${data}/train/${modelname}-sft.jsonl"
# ftmodel="${project}/model/${modelname}-algolr"
# algolrscore="${data}/eval/${modelname}-apps-test-algolr-score.jsonl"
# python -m codecritic.cli.algolr \
# --model ${model} \
# --dataset ${trainset} \
# --pairinfo ${train_selected_pairs} \
# --apps ${apps} \
# --output ${sft} \
# --level beginner \
# --tp ${tp}
# deepspeed --module \
# openrlhf.cli.train_sft \
# --max_len 5632 \
# --dataset ${sft} \
# --input_key question \
# --output_key response \
# --apply_chat_template \
# --train_batch_size 32 \
# --micro_train_batch_size 1 \
# --max_samples 500000 \
# --pretrain ${model} \
# --save_path ${ftmodel} \
# --save_steps -1 \
# --logging_steps 1 \
# --eval_steps -1 \
# --zero_stage 3 \
# --beta 0.1 \
# --max_epochs 1 \
# --bf16 \
# --flash_attn \
# --learning_rate 5e-6 \
# --load_checkpoint \
# --gradient_checkpointing \
# --adam_offload \
# --use_tensorboard "${ftmodel}_log"
# python -m codecritic.cli.test_genrm \
# --model ${ftmodel} \
# --testset ${testset} \
# --output ${algolrscore} \
# --reasoning \
# --tp 1
# # ORM
# ormmodel="${project}/model/${modelname}-orm"
# ormscore="${data}/eval/${modelname}-apps-test-orm-score.jsonl"
# deepspeed --module \
# openrlhf.cli.train_rm \
# --save_path ${ormmodel} \
# --save_steps -1 \
# --logging_steps 1 \
# --eval_steps -1 \
# --train_batch_size 128 \
# --micro_train_batch_size 1 \
# --pretrain ${model} \
# --bf16 \
# --max_epochs 1 \
# --max_len 8192 \
# --zero_stage 3 \
# --beta 0.1 \
# --learning_rate 9e-6 \
# --dataset ${reward_ds} \
# --apply_chat_template \
# --prompt_key messages \
# --chosen_key chosen \
# --rejected_key rejected \
# --flash_attn \
# --gradient_checkpointing \
# --adam_offload \
# --use_tensorboard "${ormmodel}_log"
# start_server() {
# echo "Starting server..."
# CUDA_VISIBLE_DEVICES=0,1,2,3 \
# python -m openrlhf.cli.serve_rm \
# --reward_pretrain ${ormmodel} \
# --normalize_reward \
# --port 5000 \
# --bf16 \
# --max_len 8192 &
# SERVER_PID=$!
# echo "Server started with PID: $SERVER_PID"
# }
# # Function to start the client
# start_client() {
# echo "Starting client..."
# python -m codecritic.cli.test_orm \
# --model ${ftmodel} \
# --testset ${testset} \
# --output ${ormscore}
# CLIENT_EXIT_CODE=$?
# echo "Client finished with exit code: $CLIENT_EXIT_CODE"
# }
# # Function to stop the server
# stop_server() {
# echo "Stopping server..."
# kill -SIGINT $SERVER_PID
# wait $SERVER_PID 2>/dev/null
# echo "Server stopped."
# }
# start_server
# # Give the server some time to initialize (optional)
# sleep 600
# start_client
# stop_server
# echo "Execution complete."