Commit 97085438 by nanziyuan

test on train. fix minor bugs

parent 6405c330
......@@ -74,7 +74,7 @@ if __name__ == "__main__":
reason_prompts.append(chosen_prompt)
# rejected
rejected_hints = hints_dict[task_id][rejected_id]
rejected_hints = hints_dict[task_id][rejected_id]["hint"]
rejected_prompt = promptlib.process_to_reason_prompt(rejected, rejected_hints)
reason_prompts.append(rejected_prompt)
......
......@@ -10,17 +10,17 @@ def eval(scores):
ks = list(range(1, 17))
results = []
results.extend(metric.pass_at_k(scores, ks))
results.extend(metric.pass_at_k(scores, [50]))
results.extend(metric.top_at_k(scores, ks, metric.positive_only))
# results.extend(metric.pass_at_k(scores, ks))
# results.extend(metric.pass_at_k(scores, [50]))
# results.extend(metric.top_at_k(scores, ks, metric.positive_only))
if "negative_score" in scores[0]:
results.extend(metric.top_at_k(scores, ks, metric.postive_and_negative))
for i in range(4):
threshold = 0.5 + i * 0.1
score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
results.extend(metric.top_at_k(scores, ks, score_func))
# for i in range(4):
# threshold = 0.5 + i * 0.1
# score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
# results.extend(metric.top_at_k(scores, ks, score_func))
return results
......
import argparse
from functools import partial
import os
import pprint
from transformers import AutoTokenizer
from vllm import SamplingParams
from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.evaluation.metric import postive_and_negative, binary_metrics
if __name__ == "__main__":
    # Evaluate a generative reward model (GenRM) on a slice of the training
    # set: generate a reasoning trace, then score the judge's Yes/No tokens,
    # and report binary classification metrics against the stored labels.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    # Fixed: help text previously said "path/to/testset" for --trainset.
    parser.add_argument("--trainset", type=str, help="path/to/trainset")
    parser.add_argument("--output", type=str, help="path/to/output")
    parser.add_argument(
        "--reason_max_tokens",
        type=int,
        default=4096,
        help="maximum number of tokens allowed for the reasoning process.",
    )
    parser.add_argument("--tp", type=int, default=1, help="tensor parallel")
    args = parser.parse_args()

    # Avoid tokenizer fork warnings/deadlocks in the worker processes.
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    # Quick "test on train" run: only the first 1000 items are evaluated.
    dataset = load_jsonl(args.trainset)[:1000]
    for item in dataset:
        item["messages"] = item["question"]
        # Ground-truth label: the final stored response turn is literally "Yes".
        item["pass"] = item["response"][-1]["content"] == "Yes"

    # Step 1: greedy (temperature=0) reasoning generation.
    sampling_params = SamplingParams(
        n=1,
        temperature=0,
        top_p=0.95,
        max_tokens=args.reason_max_tokens,
    )
    worker = partial(
        generate_worker, model_path=args.model, sampling_params=sampling_params
    )
    dataset = model_map(worker, dataset, args.tp)

    def get_token_id(token):
        """Return the single token id for *token*; judge tokens must not split."""
        token_ids = tokenizer.encode(token, add_special_tokens=False)
        assert len(token_ids) == 1
        return token_ids[0]

    positive_token = get_token_id(JUDGE_TOEKNS["positive"])
    negative_token = get_token_id(JUDGE_TOEKNS["negative"])

    # Step 2: append the judge question and score the Yes/No token logits.
    for item in dataset:
        item["messages"].append(JUDGE_MESSAGE)

    worker = partial(
        score_worker,
        model_path=args.model,
        positive_token=positive_token,
        negative_token=negative_token,
    )
    dataset = model_map(worker, dataset, args.tp)

    # Step 3: aggregate scores and report binary metrics against the labels.
    scores = [postive_and_negative(item) for item in dataset]
    labels = [item["pass"] for item in dataset]
    pprint.pp(binary_metrics(labels, scores))

    save_jsonl(dataset, args.output)
......@@ -50,7 +50,10 @@ def positive_only(item):
def postive_and_negative(item):
    """Normalized positive score: positive / (positive + negative).

    Returns 0 when both scores are zero to avoid division by zero.
    """
    positive = item["positive_score"]
    total = positive + item["negative_score"]
    return positive / total if total else 0
def pos_neg_filter_uncertain(item, threshold):
......
......@@ -29,33 +29,33 @@ evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-te
# --tp 1
deepspeed --module \
openrlhf.cli.train_sft \
--max_len 4096 \
--dataset ${sft} \
--input_key question \
--output_key response \
--apply_chat_template \
--train_batch_size 256 \
--micro_train_batch_size 2 \
--max_samples 500000 \
--pretrain ${model} \
--save_path ${ftmodel} \
--save_steps -1 \
--logging_steps 1 \
--eval_steps -1 \
--zero_stage 2 \
--max_epochs 1 \
--bf16 \
--flash_attn \
--learning_rate 5e-6 \
--load_checkpoint \
--gradient_checkpointing \
--use_tensorboard "${ftmodel}_log"
# deepspeed --module \
# openrlhf.cli.train_sft \
# --max_len 4096 \
# --dataset ${sft} \
# --input_key question \
# --output_key response \
# --apply_chat_template \
# --train_batch_size 256 \
# --micro_train_batch_size 2 \
# --max_samples 500000 \
# --pretrain ${model} \
# --save_path ${ftmodel} \
# --save_steps -1 \
# --logging_steps 1 \
# --eval_steps -1 \
# --zero_stage 2 \
# --max_epochs 1 \
# --bf16 \
# --flash_attn \
# --learning_rate 5e-6 \
# --load_checkpoint \
# --gradient_checkpointing \
# --use_tensorboard "${ftmodel}_log"
python -m codecritic.cli.test_genrm \
--model ${model} \
--model ${ftmodel} \
--testset ${testset} \
--output ${evalresults} \
--reasoning \
......
set -xe

# Pipeline: (1) build SFT data, (2) SFT-train the reward model, (3) score the
# test set with the GenRM. Steps 1-2 are currently disabled (commented out);
# only the evaluation step runs.
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst"

trainset="${project}/data/train/${modelname}-apps-train.jsonl"
# Fixed: testset was assigned twice (the second, hard-coded path clobbered the
# first); both expanded to the same file, so a single definition is kept.
testset="${project}/data/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
apps="/lustre/S/nanziyuan/datasets/apps/"
sft="${project}/data/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"
# NOTE(review): directory name says "qwen25_code_inst" (not "coder") — kept
# verbatim so existing result paths keep working; confirm before renaming.
evalresults="${project}/data/eval/qwen25_code_inst-apps-test-genrm-score.jsonl"

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# Step 1 (disabled): generate SFT data with reasoning hints.
# python -m codecritic.cli.algolr \
#     --model ${model} \
#     --dataset ${trainset} \
#     --pairinfo ${train_selected_pairs} \
#     --apps ${apps} \
#     --output ${sft} \
#     --level beginner \
#     --tp 1

# Step 2 (disabled): SFT-train the GenRM.
# deepspeed --module \
#     openrlhf.cli.train_sft \
#     --max_len 4096 \
#     --dataset ${sft} \
#     --input_key question \
#     --output_key response \
#     --apply_chat_template \
#     --train_batch_size 256 \
#     --micro_train_batch_size 2 \
#     --max_samples 500000 \
#     --pretrain ${model} \
#     --save_path ${ftmodel} \
#     --save_steps -1 \
#     --logging_steps 1 \
#     --eval_steps -1 \
#     --zero_stage 2 \
#     --max_epochs 1 \
#     --bf16 \
#     --flash_attn \
#     --learning_rate 5e-6 \
#     --load_checkpoint \
#     --gradient_checkpointing \
#     --use_tensorboard "${ftmodel}_log"

# Step 3: score the test set with the (base) model as a generative judge.
python -m codecritic.cli.test_genrm \
    --model ${model} \
    --testset ${testset} \
    --output ${evalresults} \
    --reasoning \
    --tp 1
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment