Commit 6405c330 by nanziyuan

fix bugs

parent 048ea73a
import argparse
from collections import defaultdict
import json
from functools import partial
from pathlib import Path
import codecritic.evaluation.metric as metric
from codecritic.utils.json import load_jsonl, save_jsonl
def eval(samples_path):
    model, testset = samples_path.stem.split('-')[:2]

    def f(item):
        item["model"] = model
        item["testset"] = testset

    samples = load_jsonl(samples_path)
from codecritic.utils.json import load_jsonl
def eval(scores):
    ks = list(range(1, 17))
    results = []
    results.append(metric.pass_at_k(samples, ks))
    results.append(metric.top_at_k(samples, ks, metric.postive_and_negative))
    results.append(metric.top_at_k(samples, ks, metric.positive_only))
    results.extend(metric.pass_at_k(scores, ks))
    results.extend(metric.pass_at_k(scores, [50]))
    results.extend(metric.top_at_k(scores, ks, metric.positive_only))
    if "negative_score" in scores[0]:
        results.extend(metric.top_at_k(scores, ks, metric.postive_and_negative))
    for i in range(4):
        threshold = 0.5 + i * 0.1
        score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
        results.append(metric.top_at_k(samples, ks, score_func))
    for i in range(4):
        threshold = 0.5 + i * 0.1
        score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
        results.extend(metric.top_at_k(scores, ks, score_func))
    return list(map(f, results))
    return results
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--sample_dir",
type=str,
default=None,
help="Path to the directory containing samples. If not provided, cached results will be used."
)
parser.add_argument("--out_dir", type=str, help="path/to/output_dir")
parser.add_argument(
"--score_func",
type=str,
default="all",
choices=["all", "posonly", "posneg", "posneg_filter"], # Add valid options
help="Select the scoring function to use. Default: 'all'."
)
parser.add_argument("--plot", type=str, help="path/to/plot")
parser.add_argument("--score", type=str, help="path/to/score")
args = parser.parse_args()
    outdir = Path(args.out_dir)
    if args.sample_dir:
        for samples_path in Path(args.sample_dir).glob("*.jsonl"):
            out_path = outdir / (samples_path.stem + "-eval.jsonl")
            if not out_path.exists():
                eval_results = eval(samples_path)
                save_jsonl(eval_results, out_path)

    for out_path in outdir.glob("*.jsonl"):
        pass
    scores = load_jsonl(args.score)

    groups = defaultdict(list)
    for item in scores:
        groups[item["dataset"]].append(item)

    for dataset, lst in groups.items():
        results = eval(lst)
        for r in results:
            r["dataset"] = dataset
            print(json.dumps(r))
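
The eval script above delegates the actual metrics to codecritic.evaluation.metric, which this commit does not touch. For orientation, the sketch below shows the standard unbiased pass@k estimator, 1 - C(n - c, k) / C(n, k), averaged over tasks; the helper name and the returned row format are illustrative assumptions, while the "task_id" and "pass" fields are taken from the scoring scripts later in this commit.

# Hedged sketch, not the code in codecritic.evaluation.metric: the usual
# unbiased pass@k estimator computed per task and averaged over all tasks.
from collections import defaultdict
from math import comb


def pass_at_k_sketch(samples, ks):
    by_task = defaultdict(list)
    for item in samples:
        by_task[item["task_id"]].append(bool(item["pass"]))

    rows = []
    for k in ks:
        estimates = []
        for outcomes in by_task.values():
            n, c = len(outcomes), sum(outcomes)
            if n < k:
                continue  # too few samples for an unbiased pass@k estimate
            estimates.append(1.0 - comb(n - c, k) / comb(n, k))
        if estimates:
            rows.append({"metric": f"pass@{k}", "score": sum(estimates) / len(estimates)})
    return rows
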
import argparse
from collections import defaultdict
from functools import partial
import os
from transformers import AutoTokenizer
from vllm import SamplingParams
from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
import codecritic.dataset.algolr_prompt as algolr_prompt
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="path/to/model")
parser.add_argument("--sample", type=str, help="path/to/sample")
parser.add_argument("--testset", type=str, help="path/to/testset")
parser.add_argument("--output", type=str, help="path/to/score")
parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
parser.add_argument(
@@ -31,11 +33,24 @@ if __name__ == "__main__":
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    dataset = load_jsonl(args.sample)
    dataset = load_jsonl(args.testset)

    # Group samples by task and split off tasks where no sample passes.
    ds = defaultdict(list)
    for item in dataset:
        ds[item["task_id"]].append(item)

    unsolvable = []
    dataset = []
    for task_id, items in ds.items():
        if all([not x["pass"] for x in items]):
            for item in items:
                item["positive_score"] = 0
                item["negative_score"] = 0
            unsolvable.extend(items)
        else:
            dataset.extend(items)

    if args.reasoning:
        for item in dataset:
            item["messages"].append(THINK_MESSAGE)
        dataset = [algolr_prompt.process_to_reason_prompt(x, None) for x in dataset]
    sampling_params = SamplingParams(
        n=1,
@@ -68,4 +83,4 @@
    )
    dataset = model_map(worker, dataset, args.tp)
    save_jsonl(dataset, args.output)
    save_jsonl(dataset + unsolvable, args.output)
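
In the script above, samples from tasks with no passing solution get positive_score = negative_score = 0, while the remaining samples are scored by score_worker, whose body lies outside this diff. Given the JUDGE_TOEKNS import, a plausible reading is that the worker converts the probability mass the judge model puts on a positive versus a negative judge token into those two fields. The sketch below is an assumption about that logic, not the project's code; the token strings "Yes"/"No" and the renormalisation over just the two tokens are illustrative.

import math


# Hedged sketch of how positive/negative scores could be derived from the
# judge model's first-token logprobs; "Yes"/"No" stand in for whatever
# JUDGE_TOEKNS actually contains.
def judge_scores_sketch(first_token_logprobs, pos_token="Yes", neg_token="No"):
    pos_p = math.exp(first_token_logprobs.get(pos_token, float("-inf")))
    neg_p = math.exp(first_token_logprobs.get(neg_token, float("-inf")))
    total = pos_p + neg_p
    if total == 0.0:
        # Judge put no mass on either token; fall back to neutral zeros.
        return {"positive_score": 0.0, "negative_score": 0.0}
    return {"positive_score": pos_p / total, "negative_score": neg_p / total}
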
import argparse
from collections import defaultdict
import json
import requests
from tqdm import tqdm
@@ -21,18 +22,28 @@ def get_rewards_from_server(server_url: str, messages: list[str]):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="path/to/model")
parser.add_argument("--sample", type=str, help="path/to/sample")
parser.add_argument("--testset", type=str, help="path/to/testset")
parser.add_argument("--output", type=str, help="path/to/score")
args = parser.parse_args()
# compute score
dataset = load_jsonl(args.sample)
dataset = load_jsonl(args.testset)
ds = defaultdict(list)
for item in dataset:
ds[item["task_id"]].append(item)
for task_id, items in ds.items():
if all([not x["pass"] for x in items]):
for item in items:
item["positive_score"] = 0
server_url = "http://0.0.0.0:5000/get_reward"
tokenizer = AutoTokenizer.from_pretrained(args.model)
for item in tqdm(dataset):
query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
score = get_rewards_from_server(server_url, [query])[0]
item["score"] = score
if 'positive_score' not in item:
query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
score = get_rewards_from_server(server_url, [query])[0]
item["positive_score"] = score
save_jsonl(dataset, args.output)
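
get_rewards_from_server is defined earlier in this file (the hunk header above names it) but its body is not part of the diff. A minimal sketch of such a helper is shown below, assuming the reward server accepts a JSON body with a "query" list and answers with a "rewards" list; those field names are assumptions, not taken from this commit.

import requests


# Hedged sketch of the helper referenced above; the "query"/"rewards" JSON
# field names are assumptions about the openrlhf.cli.serve_rm endpoint.
def get_rewards_from_server_sketch(server_url: str, messages: list[str]) -> list[float]:
    response = requests.post(server_url, json={"query": messages}, timeout=300)
    response.raise_for_status()
    return response.json()["rewards"]
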
@@ -13,18 +13,20 @@ apps="/lustre/S/nanziyuan/datasets/apps/"
sft="${project}/data/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"
testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-algolr-score.jsonl"
### export CUDA_VISIBLE_DEVICES=0,1,2,3
# export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m codecritic.cli.algolr \
    --model ${model} \
    --dataset ${trainset} \
    --pairinfo ${train_selected_pairs} \
    --apps ${apps} \
    --output ${sft} \
    --level beginner \
    --tp 1
# python -m codecritic.cli.algolr \
# --model ${model} \
# --dataset ${trainset} \
# --pairinfo ${train_selected_pairs} \
# --apps ${apps} \
# --output ${sft} \
# --level beginner \
# --tp 1
deepspeed --module \
@@ -50,3 +52,11 @@ openrlhf.cli.train_sft \
    --load_checkpoint \
    --gradient_checkpointing \
    --use_tensorboard "${ftmodel}_log"
python -m codecritic.cli.test_genrm \
    --model ${model} \
    --testset ${testset} \
    --output ${evalresults} \
    --reasoning \
    --tp 1
@@ -6,66 +6,69 @@ project="/lustre/S/nanziyuan/projects/ccc"
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
ftmodel="${project}/model/qwen25_coder_inst_7b-orm"
deepspeed --module \
    openrlhf.cli.train_rm \
    --save_path ${ftmodel} \
    --save_steps -1 \
    --logging_steps 1 \
    --eval_steps -1 \
    --train_batch_size 256 \
    --micro_train_batch_size 1 \
    --pretrain ${model} \
    --bf16 \
    --max_epochs 1 \
    --max_len 8192 \
    --zero_stage 3 \
    --learning_rate 9e-6 \
    --dataset ${dataset} \
    --apply_chat_template \
    --prompt_key messages \
    --chosen_key chosen \
    --rejected_key rejected \
    --flash_attn \
    --load_checkpoint \
    --gradient_checkpointing \
    --use_tensorboard "${ftmodel}_log"
testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-orm-score.jsonl"
# deepspeed --module \
# openrlhf.cli.train_rm \
# --save_path ${ftmodel} \
# --save_steps -1 \
# --logging_steps 1 \
# --eval_steps -1 \
# --train_batch_size 256 \
# --micro_train_batch_size 1 \
# --pretrain ${model} \
# --bf16 \
# --max_epochs 1 \
# --max_len 8192 \
# --zero_stage 3 \
# --learning_rate 9e-6 \
# --dataset ${dataset} \
# --apply_chat_template \
# --prompt_key messages \
# --chosen_key chosen \
# --rejected_key rejected \
# --flash_attn \
# --load_checkpoint \
# --gradient_checkpointing \
# --use_tensorboard "${ftmodel}_log"
# start_server() {
# echo "Starting server..."
# CUDA_VISIBLE_DEVICES=0 \
# python -m openrlhf.cli.serve_rm \
# --reward_pretrain ${model} \
# --normalize_reward \
# --port 5000 \
# --bf16 \
# --max_len 8192 &
# SERVER_PID=$!
# echo "Server started with PID: $SERVER_PID"
# }
# # Function to start the client
# start_client() {
# echo "Starting client..."
# python -m codecritic.cli.run_rm_test \
# --model ${model} \
# --test "${datasets}/sample/min_test.jsonl" \
# --apps /lustre/S/nanziyuan/datasets/apps/
# CLIENT_EXIT_CODE=$?
# echo "Client finished with exit code: $CLIENT_EXIT_CODE"
# }
start_server() {
    echo "Starting server..."
    CUDA_VISIBLE_DEVICES=0 \
    python -m openrlhf.cli.serve_rm \
        --reward_pretrain ${ftmodel} \
        --normalize_reward \
        --port 5000 \
        --bf16 \
        --max_len 8192 &
    SERVER_PID=$!
    echo "Server started with PID: $SERVER_PID"
}
# # Function to stop the server
# stop_server() {
# echo "Stopping server..."
# kill -SIGINT $SERVER_PID
# wait $SERVER_PID 2>/dev/null
# echo "Server stopped."
# }
# Function to start the client
start_client() {
    echo "Starting client..."
    python -m codecritic.cli.test_orm \
        --model ${ftmodel} \
        --testset ${testset} \
        --output ${evalresults}
    CLIENT_EXIT_CODE=$?
    echo "Client finished with exit code: $CLIENT_EXIT_CODE"
}
# start_server
# # Give the server some time to initialize (optional)
# sleep 60
# start_client
# stop_server
# echo "Execution complete."
# Function to stop the server
stop_server() {
    echo "Stopping server..."
    kill -SIGINT $SERVER_PID
    wait $SERVER_PID 2>/dev/null
    echo "Server stopped."
}
start_server
# Give the server some time to initialize (optional)
sleep 60
start_client
stop_server
echo "Execution complete."