Commit 6405c330 by nanziyuan

fix bugs

parent 048ea73a
import argparse
+from collections import defaultdict
+import json
from functools import partial
-from pathlib import Path

import codecritic.evaluation.metric as metric
-from codecritic.utils.json import load_jsonl, save_jsonl
+from codecritic.utils.json import load_jsonl


-def eval(samples_path):
-    model, testset = samples_path.stem.split('-')[:2]
-
-    def f(item):
-        item["model"] = model
-        item["testset"] = testset
-
-    samples = load_jsonl(samples_path)
-
+def eval(scores):
    ks = list(range(1, 17))
    results = []
-    results.append(metric.pass_at_k(samples, ks))
-    results.append(metric.top_at_k(samples, ks, metric.postive_and_negative))
-    results.append(metric.top_at_k(samples, ks, metric.positive_only))
-
-    for i in range(4):
-        threshold = 0.5 + i * 0.1
-        score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
-        results.append(metric.top_at_k(samples, ks, score_func))
-
-    return list(map(f, results))
+    results.extend(metric.pass_at_k(scores, ks))
+    results.extend(metric.pass_at_k(scores, [50]))
+    results.extend(metric.top_at_k(scores, ks, metric.positive_only))
+
+    if "negative_score" in scores[0]:
+        results.extend(metric.top_at_k(scores, ks, metric.postive_and_negative))
+
+        for i in range(4):
+            threshold = 0.5 + i * 0.1
+            score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
+            results.extend(metric.top_at_k(scores, ks, score_func))
+
+    return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--sample_dir",
-        type=str,
-        default=None,
-        help="Path to the directory containing samples. If not provided, cached results will be used."
-    )
-    parser.add_argument("--out_dir", type=str, help="path/to/output_dir")
-    parser.add_argument(
-        "--score_func",
-        type=str,
-        default="all",
-        choices=["all", "posonly", "posneg", "posneg_filter"],  # Add valid options
-        help="Select the scoring function to use. Default: 'all'."
-    )
-    parser.add_argument("--plot", type=str, help="path/to/plot")
+    parser.add_argument("--score", type=str, help="path/to/score")
    args = parser.parse_args()

-    outdir = Path(args.out_dir)
-    if args.sample_dir:
-        for samples_path in Path(args.sample_dir).glob("*.jsonl"):
-            out_path = outdir / (samples_path.stem + "-eval.jsonl")
-            if not out_path.exists():
-                eval_results = eval(samples_path)
-                save_jsonl(eval_results, out_path)
-
-    for out_path in outdir.glob("*.jsonl"):
-        pass
+    scores = load_jsonl(args.score)
+
+    groups = defaultdict(list)
+    for item in scores:
+        groups[item["dataset"]].append(item)
+
+    for dataset, lst in groups.items():
+        results = eval(lst)
+        for r in results:
+            r["dataset"] = dataset
+            print(json.dumps(r))
\ No newline at end of file
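The eval CLI above leans on codecritic.evaluation.metric, which this commit does not touch. For reference, pass@k is normally computed with the standard unbiased estimator; the sketch below is only an illustration of that interface (the "task_id" and "pass" fields are taken from the scripts in this commit, everything else is assumed), not the actual metric module.

# Illustrative sketch only -- the real codecritic.evaluation.metric is not shown in this diff.
# Assumes each score item carries "task_id" and a boolean "pass", as the scripts below do.
import math
from collections import defaultdict

def _pass_at_k(n, c, k):
    # Standard unbiased estimator: 1 - C(n - c, k) / C(n, k)
    if n - c < k:
        return 1.0
    return 1.0 - math.comb(n - c, k) / math.comb(n, k)

def pass_at_k(items, ks):
    by_task = defaultdict(list)
    for item in items:
        by_task[item["task_id"]].append(bool(item["pass"]))
    rows = []
    for k in ks:
        vals = [_pass_at_k(len(v), sum(v), k) for v in by_task.values() if len(v) >= k]
        rows.append({"metric": f"pass@{k}", "value": sum(vals) / len(vals)})
    return rows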
import argparse
+from collections import defaultdict
from functools import partial
import os

from transformers import AutoTokenizer
from vllm import SamplingParams

-from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS
+from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
+import codecritic.dataset.algolr_prompt as algolr_prompt


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
-    parser.add_argument("--sample", type=str, help="path/to/sample")
+    parser.add_argument("--testset", type=str, help="path/to/testset")
    parser.add_argument("--output", type=str, help="path/to/score")
    parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
    parser.add_argument(
@@ -31,11 +33,24 @@ if __name__ == "__main__":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    tokenizer = AutoTokenizer.from_pretrained(args.model)

-    dataset = load_jsonl(args.sample)
+    dataset = load_jsonl(args.testset)
+
+    ds = defaultdict(list)
+    for item in dataset:
+        ds[item["task_id"]].append(item)
+
+    unsolvable = []
+    dataset = []
+    for task_id, items in ds.items():
+        if all([not x["pass"] for x in items]):
+            for item in items:
+                item["positive_score"] = 0
+                item["negative_score"] = 0
+            unsolvable.extend(items)
+        else:
+            dataset.extend(items)

    if args.reasoning:
-        for item in dataset:
-            item["messages"].append(THINK_MESSAGE)
+        dataset = [algolr_prompt.process_to_reason_prompt(x, None) for x in dataset]

    sampling_params = SamplingParams(
        n=1,
@@ -68,4 +83,4 @@ if __name__ == "__main__":
    )

    dataset = model_map(worker, dataset, args.tp)
-    save_jsonl(dataset, args.output)
+    save_jsonl(dataset + unsolvable, args.output)
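score_worker and JUDGE_TOEKNS are imported above but defined elsewhere in the repo. One plausible reading is that the judge model's answer is scored by the probability it assigns to a yes-style versus a no-style judge token; the helper below is a hypothetical illustration of that idea, not the codecritic implementation.

# Hypothetical sketch: token ids and field names are assumptions, not repo code.
import math

def judge_scores(logprobs, yes_token_id, no_token_id):
    # logprobs: {token_id: log probability} at the judge position.
    p_yes = math.exp(logprobs.get(yes_token_id, float("-inf")))
    p_no = math.exp(logprobs.get(no_token_id, float("-inf")))
    total = p_yes + p_no
    if total == 0.0:
        return 0.5, 0.5  # neither judge token observed; treat as uncertain
    return p_yes / total, p_no / total  # -> positive_score, negative_score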
import argparse
+from collections import defaultdict
import json
import requests
from tqdm import tqdm
@@ -21,18 +22,28 @@ def get_rewards_from_server(server_url: str, messages: list[str]):
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
-    parser.add_argument("--sample", type=str, help="path/to/sample")
+    parser.add_argument("--testset", type=str, help="path/to/testset")
    parser.add_argument("--output", type=str, help="path/to/score")
    args = parser.parse_args()

    # compute score
-    dataset = load_jsonl(args.sample)
+    dataset = load_jsonl(args.testset)
+
+    ds = defaultdict(list)
+    for item in dataset:
+        ds[item["task_id"]].append(item)
+
+    for task_id, items in ds.items():
+        if all([not x["pass"] for x in items]):
+            for item in items:
+                item["positive_score"] = 0

    server_url = "http://0.0.0.0:5000/get_reward"
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    for item in tqdm(dataset):
-        query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
-        score = get_rewards_from_server(server_url, [query])[0]
-        item["score"] = score
+        if 'positive_score' not in item:
+            query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
+            score = get_rewards_from_server(server_url, [query])[0]
+            item["positive_score"] = score

    save_jsonl(dataset, args.output)
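Only the signature of get_rewards_from_server is visible in the hunk context above. A minimal client sketch is given below; the payload and response field names are assumptions about the reward server started by the shell script further down and may not match it exactly.

# Sketch only: the "query"/"rewards" field names are assumed, not confirmed by this diff.
import requests

def get_rewards_from_server(server_url: str, messages: list[str]) -> list[float]:
    # POST the rendered chat prompts; expect one scalar reward per prompt back.
    resp = requests.post(server_url, json={"query": messages}, timeout=600)
    resp.raise_for_status()
    return resp.json()["rewards"]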
@@ -13,18 +13,20 @@ apps="/lustre/S/nanziyuan/datasets/apps/"
sft="${project}/data/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"

-### export CUDA_VISIBLE_DEVICES=0,1,2,3
-python -m codecritic.cli.algolr \
-    --model ${model} \
-    --dataset ${trainset} \
-    --pairinfo ${train_selected_pairs} \
-    --apps ${apps} \
-    --output ${sft} \
-    --level beginner \
-    --tp 1
+testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
+evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-algolr-score.jsonl"
+
+# export CUDA_VISIBLE_DEVICES=0,1,2,3
+# python -m codecritic.cli.algolr \
+#     --model ${model} \
+#     --dataset ${trainset} \
+#     --pairinfo ${train_selected_pairs} \
+#     --apps ${apps} \
+#     --output ${sft} \
+#     --level beginner \
+#     --tp 1

deepspeed --module \
@@ -50,3 +52,11 @@ openrlhf.cli.train_sft \
    --load_checkpoint \
    --gradient_checkpointing \
    --use_tensorboard "${ftmodel}_log"
+
+python -m codecritic.cli.test_genrm \
+    --model ${model} \
+    --testset ${testset} \
+    --output ${evalresults} \
+    --reasoning \
+    --tp 1
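The score file written by test_genrm is presumably fed to the eval CLI at the top of this commit, which reads --score and prints one JSON row per metric. The module path below is a guess, since the eval script's filename is not shown in this diff.

# Hypothetical follow-up step; the eval CLI's actual module name is not visible here.
python -m codecritic.cli.eval --score ${evalresults} > ${evalresults%.jsonl}-metrics.jsonl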
@@ -6,66 +6,69 @@ project="/lustre/S/nanziyuan/projects/ccc"
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
ftmodel="${project}/model/qwen25_coder_inst_7b-orm"

-deepspeed --module \
-    openrlhf.cli.train_rm \
-    --save_path ${ftmodel} \
-    --save_steps -1 \
-    --logging_steps 1 \
-    --eval_steps -1 \
-    --train_batch_size 256 \
-    --micro_train_batch_size 1 \
-    --pretrain ${model} \
-    --bf16 \
-    --max_epochs 1 \
-    --max_len 8192 \
-    --zero_stage 3 \
-    --learning_rate 9e-6 \
-    --dataset ${dataset} \
-    --apply_chat_template \
-    --prompt_key messages \
-    --chosen_key chosen \
-    --rejected_key rejected \
-    --flash_attn \
-    --load_checkpoint \
-    --gradient_checkpointing \
-    --use_tensorboard "${ftmodel}_log"
-
-# start_server() {
-#     echo "Starting server..."
-#     CUDA_VISIBLE_DEVICES=0 \
-#     python -m openrlhf.cli.serve_rm \
-#         --reward_pretrain ${model} \
-#         --normalize_reward \
-#         --port 5000 \
-#         --bf16 \
-#         --max_len 8192 &
-#     SERVER_PID=$!
-#     echo "Server started with PID: $SERVER_PID"
-# }
-
-# # Function to start the client
-# start_client() {
-#     echo "Starting client..."
-#     python -m codecritic.cli.run_rm_test \
-#         --model ${model} \
-#         --test "${datasets}/sample/min_test.jsonl" \
-#         --apps /lustre/S/nanziyuan/datasets/apps/
-#     CLIENT_EXIT_CODE=$?
-#     echo "Client finished with exit code: $CLIENT_EXIT_CODE"
-# }
-
-# # Function to stop the server
-# stop_server() {
-#     echo "Stopping server..."
-#     kill -SIGINT $SERVER_PID
-#     wait $SERVER_PID 2>/dev/null
-#     echo "Server stopped."
-# }
-
-# start_server
-# # Give the server some time to initialize (optional)
-# sleep 60
-# start_client
-# stop_server
-# echo "Execution complete."
+testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
+evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-orm-score.jsonl"
+
+# deepspeed --module \
+#     openrlhf.cli.train_rm \
+#     --save_path ${ftmodel} \
+#     --save_steps -1 \
+#     --logging_steps 1 \
+#     --eval_steps -1 \
+#     --train_batch_size 256 \
+#     --micro_train_batch_size 1 \
+#     --pretrain ${model} \
+#     --bf16 \
+#     --max_epochs 1 \
+#     --max_len 8192 \
+#     --zero_stage 3 \
+#     --learning_rate 9e-6 \
+#     --dataset ${dataset} \
+#     --apply_chat_template \
+#     --prompt_key messages \
+#     --chosen_key chosen \
+#     --rejected_key rejected \
+#     --flash_attn \
+#     --load_checkpoint \
+#     --gradient_checkpointing \
+#     --use_tensorboard "${ftmodel}_log"
+
+start_server() {
+    echo "Starting server..."
+    CUDA_VISIBLE_DEVICES=0 \
+    python -m openrlhf.cli.serve_rm \
+        --reward_pretrain ${ftmodel} \
+        --normalize_reward \
+        --port 5000 \
+        --bf16 \
+        --max_len 8192 &
+    SERVER_PID=$!
+    echo "Server started with PID: $SERVER_PID"
+}
+
+# Function to start the client
+start_client() {
+    echo "Starting client..."
+    python -m codecritic.cli.test_orm \
+        --model ${ftmodel} \
+        --testset ${testset} \
+        --output ${evalresults}
+    CLIENT_EXIT_CODE=$?
+    echo "Client finished with exit code: $CLIENT_EXIT_CODE"
+}
+
+# Function to stop the server
+stop_server() {
+    echo "Stopping server..."
+    kill -SIGINT $SERVER_PID
+    wait $SERVER_PID 2>/dev/null
+    echo "Server stopped."
+}
+
+start_server
+# Give the server some time to initialize (optional)
+sleep 60
+start_client
+stop_server
+echo "Execution complete."