Commit 36054b07 by nzy

Merge branch 'main' of http://62.234.201.16/nzy/codecritic

parents 030e1e12 2a43e44e
import argparse
from collections import defaultdict
from functools import partial
+import pprint
+import random
from vllm import SamplingParams
from datasets import load_dataset

@@ -51,11 +53,14 @@ if __name__ == "__main__":
worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
hint_responses = model_map(worker, hint_prompts, args.tp)
+pprint.pp(hint_responses[0])
hints = [promptlib.postprocess_to_hint(x) for x in hint_responses]
# hints: {"dataset"..., "task_id": ..., "solution_id": ..., "hints": ...}
+# save_jsonl(hint_responses, args.output + ".hint_responses")
save_jsonl(hints, args.output + ".hints")
+# hints = load_jsonl(args.output + ".hints")
hints_dict = defaultdict(dict)
for item in hints:

@@ -73,7 +78,7 @@ if __name__ == "__main__":
reason_prompts.append(chosen_prompt)
# rejected
-rejected_hints = hints_dict[task_id][rejected_id]
+rejected_hints = hints_dict[task_id][rejected_id]["hint"]
rejected_prompt = promptlib.process_to_reason_prompt(rejected, rejected_hints)
reason_prompts.append(rejected_prompt)

@@ -86,6 +91,8 @@ if __name__ == "__main__":
worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
reason_responses = model_map(worker, reason_prompts, args.tp)
+pprint.pp(reason_responses[0])
+save_jsonl(reason_responses, args.output + ".reason")
# Step3 Verify reasoning results
# add prompt "correct the code based the reasoning"

@@ -114,6 +121,7 @@ if __name__ == "__main__":
worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
verify_responses = model_map(worker, reason_responses, args.tp)
+pprint.pp(verify_responses[0])
print("verify response size: {}".format(len(verify_responses)))
# postprocess verify_response.

@@ -148,8 +156,7 @@ if __name__ == "__main__":
print("Corrects (execution consistent) size: {}".format(len(corrects)))
print("Incorrects (execution consistent) size: {}".format(len(incorrects)))
-# Step4 Remove hints and Reformat to a SFT dataset
-# extract reasoning sets
+# Step4 Reformat to a SFT dataset
sft = []
for item in verify_passed:

@@ -169,4 +176,26 @@ if __name__ == "__main__":
sft.append(line)
print("Size of sft dataset: {}".format(len(sft)))
+pprint.pp(sft[0])
save_jsonl(sft, args.output)
+# Step5 keep 1 rationale for 1 solution
+task_solution_map = defaultdict(lambda: defaultdict(list))
+for entry in sft:
+task_id = entry["task_id"]
+solution_id = entry["solution_id"]
+task_solution_map[task_id][solution_id.split("_")[0]].append(entry)
+# Step 2: Keep only one reasoning for each solution
+processed_dataset = []
+for task_id, solution_map in task_solution_map.items():
+for solution, reasoning_list in solution_map.items():
+if len(reasoning_list) > 1:
+selected_index = random.choice(range(1, len(reasoning_list)))
+processed_dataset.append(reasoning_list[selected_index])
+else:
+processed_dataset.append(reasoning_list[0])
+save_jsonl(processed_dataset, args.output.split('.')[0] + "-filtered.jsonl")
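Note on the new Step5 block: `random.choice(range(1, len(reasoning_list)))` draws an index from 1 to len-1, so the first rationale of a multi-rationale group is never the one kept. A minimal, self-contained sketch of the same deduplication on invented toy entries (field names mirror the diff; the `solution_id` format is an assumption):

```python
import random
from collections import defaultdict

# Toy SFT entries; values are made up for illustration.
sft = [
    {"task_id": "t1", "solution_id": "s0_0"},
    {"task_id": "t1", "solution_id": "s0_1"},
    {"task_id": "t1", "solution_id": "s1_0"},
]

task_solution_map = defaultdict(lambda: defaultdict(list))
for entry in sft:
    # "s0_1".split("_")[0] -> "s0": rationales of the same solution share a bucket
    task_solution_map[entry["task_id"]][entry["solution_id"].split("_")[0]].append(entry)

processed = []
for solution_map in task_solution_map.values():
    for reasoning_list in solution_map.values():
        if len(reasoning_list) > 1:
            # range(1, n) excludes index 0, so the first rationale is never picked here
            processed.append(reasoning_list[random.choice(range(1, len(reasoning_list)))])
        else:
            processed.append(reasoning_list[0])

print(len(processed))  # 2: one rationale kept per solution
```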
import argparse
+from collections import defaultdict
+import json
from functools import partial
-from pathlib import Path
import codecritic.evaluation.metric as metric
-from codecritic.utils.json import load_jsonl, save_jsonl
+from codecritic.utils.json import load_jsonl

-def eval(samples_path):
-model, testset = samples_path.stem.split('-')[:2]
-def f(item):
-item["model"] = model
-item["testset"] = testset
-samples = load_jsonl(samples_path)
+def eval(scores):
ks = list(range(1, 17))
results = []
-results.append(metric.pass_at_k(samples, ks))
-results.append(metric.top_at_k(samples, ks, metric.postive_and_negative))
-results.append(metric.top_at_k(samples, ks, metric.positive_only))
-for i in range(4):
-threshold = 0.5 + i * 0.1
-score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
-results.append(metric.top_at_k(samples, ks, score_func))
-return list(map(f, results))
+# results.extend(metric.pass_at_k(scores, ks))
+# results.extend(metric.pass_at_k(scores, [50]))
+# results.extend(metric.top_at_k(scores, ks, metric.positive_only))
+if "negative_score" in scores[0]:
+results.extend(metric.top_at_k(scores, ks, metric.postive_and_negative))
+# for i in range(4):
+# threshold = 0.5 + i * 0.1
+# score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
+# results.extend(metric.top_at_k(scores, ks, score_func))
+return results

if __name__ == "__main__":
parser = argparse.ArgumentParser()
-parser.add_argument(
-"--sample_dir",
-type=str,
-default=None,
-help="Path to the directory containing samples. If not provided, cached results will be used."
-)
-parser.add_argument("--out_dir", type=str, help="path/to/output_dir")
-parser.add_argument(
-"--score_func",
-type=str,
-default="all",
-choices=["all", "posonly", "posneg", "posneg_filter"], # Add valid options
-help="Select the scoring function to use. Default: 'all'."
-)
-parser.add_argument("--plot", type=str, help="path/to/plot")
+parser.add_argument("--score", type=str, help="path/to/score")
args = parser.parse_args()
-outdir = Path(args.out_dir)
-if args.sample_dir:
-for samples_path in Path(args.sample_dir).glob("*.jsonl"):
-out_path = outdir / (samples_path.stem + "-eval.jsonl")
-if not out_path.exists():
-eval_results = eval(samples_path)
-save_jsonl(eval_results, out_path)
-for out_path in outdir.glob("*.jsonl"):
-pass
\ No newline at end of file
+scores = load_jsonl(args.score)
+groups = defaultdict(list)
+for item in scores:
+groups[item["dataset"]].append(item)
+for dataset, lst in groups.items():
+results = eval(lst)
+for r in results:
+r["dataset"] = dataset
+print(json.dumps(r))
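Note: the eval helper above delegates to `metric.pass_at_k` / `metric.top_at_k` from `codecritic.evaluation.metric`. For reference, pass@k is commonly computed with the unbiased estimator below; whether the repo uses exactly this form is an assumption, so the sketch is illustrative only:

```python
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """Unbiased pass@k estimate given n samples per task, c of which are correct."""
    if n - c < k:
        return 1.0  # every size-k subset must contain a correct sample
    return 1.0 - comb(n - c, k) / comb(n, k)

# Example: 16 samples per task, 4 of them pass the unit tests.
print(round(pass_at_k(16, 4, 1), 3))  # 0.25
print(round(pass_at_k(16, 4, 8), 3))  # ~0.962
```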
import argparse
+from collections import defaultdict
from functools import partial
import os
from transformers import AutoTokenizer
from vllm import SamplingParams
-from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS
+from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
+import codecritic.dataset.algolr_prompt as algolr_prompt

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="path/to/model")
-parser.add_argument("--sample", type=str, help="path/to/sample")
+parser.add_argument("--testset", type=str, help="path/to/testset")
parser.add_argument("--output", type=str, help="path/to/score")
parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
parser.add_argument(

@@ -31,11 +33,24 @@ if __name__ == "__main__":
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(args.model)
-dataset = load_jsonl(args.sample)
+dataset = load_jsonl(args.testset)
+ds = defaultdict(list)
+for item in dataset:
+ds[item["task_id"]].append(item)
+unsolvable = []
+dataset = []
+for task_id, items in ds.items():
+if all([not x["pass"] for x in items]):
+for item in items:
+item["positive_score"] = 0
+item["negative_score"] = 0
+unsolvable.extend(items)
+else:
+dataset.extend(items)
if args.reasoning:
-for item in dataset:
-item["messages"].append(THINK_MESSAGE)
+dataset = [algolr_prompt.process_to_reason_prompt(x, None) for x in dataset]
sampling_params = SamplingParams(
n=1,

@@ -68,4 +83,4 @@ if __name__ == "__main__":
)
dataset = model_map(worker, dataset, args.tp)
-save_jsonl(dataset, args.output)
+save_jsonl(dataset + unsolvable, args.output)
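Note: the added block groups the test set by task and short-circuits tasks where no sampled solution passes; those items get zero positive/negative scores, skip generation entirely, and are concatenated back before saving. A compact sketch of the same idea on invented toy data (field names follow the diff):

```python
from collections import defaultdict

samples = [
    {"task_id": "a", "pass": False},
    {"task_id": "a", "pass": True},
    {"task_id": "b", "pass": False},
    {"task_id": "b", "pass": False},
]

by_task = defaultdict(list)
for item in samples:
    by_task[item["task_id"]].append(item)

unsolvable, to_score = [], []
for items in by_task.values():
    if all(not x["pass"] for x in items):
        for item in items:
            # no correct solution exists, so there is nothing useful to rank here
            item["positive_score"] = 0
            item["negative_score"] = 0
        unsolvable.extend(items)
    else:
        to_score.extend(items)

print(len(to_score), len(unsolvable))  # 2 2 -> only task "a" goes through the model
```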
import argparse
+from collections import defaultdict
import json
import requests
from tqdm import tqdm

@@ -21,18 +22,28 @@ def get_rewards_from_server(server_url: str, messages: list[str]):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="path/to/model")
-parser.add_argument("--sample", type=str, help="path/to/sample")
+parser.add_argument("--testset", type=str, help="path/to/testset")
parser.add_argument("--output", type=str, help="path/to/score")
args = parser.parse_args()

# compute score
-dataset = load_jsonl(args.sample)
+dataset = load_jsonl(args.testset)
+ds = defaultdict(list)
+for item in dataset:
+ds[item["task_id"]].append(item)
+for task_id, items in ds.items():
+if all([not x["pass"] for x in items]):
+for item in items:
+item["positive_score"] = 0
server_url = "http://0.0.0.0:5000/get_reward"
tokenizer = AutoTokenizer.from_pretrained(args.model)
for item in tqdm(dataset):
+if 'positive_score' not in item:
query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
score = get_rewards_from_server(server_url, [query])[0]
-item["score"] = score
+item["positive_score"] = score
save_jsonl(dataset, args.output)
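Note: `get_rewards_from_server` (its signature is visible in the hunk header) posts chat-template-rendered queries to the OpenRLHF reward-model server started in the accompanying shell script. A rough sketch of what such a helper could look like is below; the request and response JSON keys are assumptions, not taken from this commit:

```python
import requests

def get_rewards_from_server(server_url: str, messages: list[str]) -> list[float]:
    # Assumed payload/response layout: {"query": [...]} in, {"rewards": [...]} out.
    resp = requests.post(server_url, json={"query": messages}, timeout=60)
    resp.raise_for_status()
    return resp.json()["rewards"]

# rewards = get_rewards_from_server("http://0.0.0.0:5000/get_reward", [query])
```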
import argparse
from functools import partial
import os
import pprint
from transformers import AutoTokenizer
from vllm import SamplingParams
from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.evaluation.metric import postive_and_negative, binary_metrics
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="path/to/model")
parser.add_argument("--trainset", type=str, help="path/to/testset")
parser.add_argument("--output", type=str, help="path/to/output")
parser.add_argument(
"--reason_max_tokens",
type=int,
default=4096,
help="maximum number of tokens allowed for the reasoning process.",
)
parser.add_argument(
"--tp", type=int, default=1, help="tensor parallel"
)
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(args.model)
dataset = load_jsonl(args.trainset)[:1000]
for item in dataset:
item["messages"] = item["question"]
item["pass"] = (item["response"][-1]["content"] == "Yes")
sampling_params = SamplingParams(
n=1,
temperature=0,
top_p=0.95,
max_tokens=args.reason_max_tokens,
)
worker = partial(
generate_worker, model_path=args.model, sampling_params=sampling_params
)
dataset = model_map(worker, dataset, args.tp)
def get_token_id(token):
score_tokens = tokenizer.encode(token, add_special_tokens=False)
assert len(score_tokens) == 1
return score_tokens[0]
positive_token = get_token_id(JUDGE_TOEKNS["positive"])
negative_token = get_token_id(JUDGE_TOEKNS["negative"])
for item in dataset:
item["messages"].append(JUDGE_MESSAGE)
worker = partial(
score_worker,
model_path=args.model,
positive_token=positive_token,
negative_token=negative_token,
)
dataset = model_map(worker, dataset, args.tp)
scores = [postive_and_negative(item) for item in dataset]
labels = [item["pass"] for item in dataset]
pprint.pp(binary_metrics(labels, scores))
save_jsonl(dataset, args.output)
@@ -152,10 +152,7 @@ def remove_hint(item):
def extract_conclusion_and_code(response):
# Extract conclusion
-if 'Conclusion:' not in response:
-conclusion = None
-print("not found conclusion\n{}".format(response))
-else:
+try:
conclusion_line = [line for line in response.split('\n') if line.startswith('Conclusion:')][0]
conclusion_str = conclusion_line.split(': ')[1].strip().lower()

@@ -166,6 +163,9 @@ def extract_conclusion_and_code(response):
else:
print("llm doesn't draw to a conclusion\n{}".format(response))
conclusion = None
+except Exception as e:
+print("not found conclusion\n{}\n{}".format(response, e))
+conclusion = None
# Extract corrected code if conclusion is 'No'
corrected_code = ""
...
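Note: the rewritten parser replaces the explicit 'Conclusion:' membership check with a try/except, which also covers a malformed line such as `Conclusion:no` (no `': '` separator, so `split(': ')[1]` raises IndexError) in addition to the missing-line case (`[0]` on an empty list). A small illustrative sketch of the parsing core, simplified from the original function:

```python
def parse_conclusion(response: str):
    # Mirrors the logic in extract_conclusion_and_code; simplified for illustration.
    try:
        line = [ln for ln in response.split("\n") if ln.startswith("Conclusion:")][0]
        value = line.split(": ")[1].strip().lower()
    except Exception:
        return None  # no "Conclusion:" line, or no ": " separator on it
    if value in ("yes", "no"):
        return value
    return None

print(parse_conclusion("...\nConclusion: Yes"))  # "yes"
print(parse_conclusion("...\nConclusion:No"))    # None (missing ": " separator)
print(parse_conclusion("no conclusion given"))   # None
```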
@@ -84,7 +84,7 @@ def evaluate_code_samples(code_samples, apps):
cpu_num = multiprocessing.cpu_count() // 2
chunksize = max(len(code_samples) // (cpu_num * 10), 1)
results = process_map(
-test_generation, args, max_workers=cpu_num, chunksize=chunksize
+test_generation, args, max_workers=cpu_num, chunksize=1
)
return results

@@ -100,7 +100,7 @@ def evaluate(code_samples, apps):
The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
"""
all_results = []
-for _ in range(2):
+for _ in range(1):
results = evaluate_code_samples(code_samples, apps)
all_results.append(results)
...
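Note: `chunksize=1` makes `process_map` hand each test-execution job to a worker individually instead of in batches, so a single slow sample no longer delays a whole chunk, and the consistency loop now runs one pass instead of two. A minimal usage sketch, assuming `process_map` here is `tqdm.contrib.concurrent.process_map`:

```python
# Minimal sketch assuming process_map comes from tqdm.contrib.concurrent.
from tqdm.contrib.concurrent import process_map

def square(x):
    return x * x

if __name__ == "__main__":
    # chunksize=1: each item is dispatched to a worker on its own.
    results = process_map(square, range(8), max_workers=2, chunksize=1)
    print(results)  # [0, 1, 4, 9, 16, 25, 36, 49]
```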
@@ -350,9 +350,9 @@ def run_test(sample, test=None, debug=False):
# try by converting the stuff into split up list
if isinstance(in_outs["outputs"][index], list):
for tmp_index, i in enumerate(in_outs["outputs"][index]):
-in_outs["outputs"][index][tmp_index] = set(i.split())
+in_outs["outputs"][index][tmp_index] = list(i.split())
else:
-in_outs["outputs"][index] = set(in_outs["outputs"][index].split())
+in_outs["outputs"][index] = list(in_outs["outputs"][index].split())
try:
tmp_result = (output == in_outs["outputs"][index])

@@ -371,14 +371,14 @@ def run_test(sample, test=None, debug=False):
output[tmp_index] = i.split()
output = list(filter(len, output))
for tmp_index, i in enumerate(output):
-output[tmp_index] = set(i)
+output[tmp_index] = list(i)
else:
output = output.split()
output = list(filter(len, output))
-output = set(output)
+output = list(output)
try:
-tmp_result = (set(frozenset(s) for s in output) == set(frozenset(s) for s in in_outs["outputs"][index]))
+tmp_result = (list(list(s) for s in output) == list(list(s) for s in in_outs["outputs"][index]))
except Exception as e:
if debug:
print(f"Failed check5 exception = {e}")

@@ -386,8 +386,8 @@ def run_test(sample, test=None, debug=False):
# if they are all numbers, round so that similar numbers are treated as identical
try:
-tmp_result = tmp_result or (set(frozenset(round(float(t),3) for t in s) for s in output) ==\
-set(frozenset(round(float(t),3) for t in s) for s in in_outs["outputs"][index]))
+tmp_result = tmp_result or (list(list(round(float(t),3) for t in s) for s in output) ==\
+list(list(round(float(t),3) for t in s) for s in in_outs["outputs"][index]))
except Exception as e:
if debug:
print(f"Failed check6 exception = {e}")
...
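Note: the `set`/`frozenset` to `list` switch changes the comparison semantics of these fallback checks: sets ignore token order and duplicates, lists require the exact sequence. For example:

```python
expected = "1 2 2".split()  # ["1", "2", "2"]
produced = "2 1".split()    # ["2", "1"]

# Old behaviour: set comparison ignores order and duplicates.
print(set(produced) == set(expected))  # True  -> counted as a pass
# New behaviour: list comparison requires the exact sequence.
print(produced == expected)            # False -> counted as a fail
```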
@@ -50,6 +50,9 @@ def positive_only(item):
def postive_and_negative(item):
pos = item["positive_score"]
neg = item["negative_score"]
+if (pos + neg) == 0:
+return 0
+else:
return pos / (pos + neg)
...
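Note: this guard matters because elsewhere in the commit unsolvable tasks are written out with `positive_score = negative_score = 0`; the old body raised `ZeroDivisionError` on them, while the new one ranks them at 0. A tiny check:

```python
def postive_and_negative(item):  # name kept as in the codebase
    pos = item["positive_score"]
    neg = item["negative_score"]
    if (pos + neg) == 0:
        return 0
    return pos / (pos + neg)

print(postive_and_negative({"positive_score": 3, "negative_score": 1}))  # 0.75
print(postive_and_negative({"positive_score": 0, "negative_score": 0}))  # 0 (previously ZeroDivisionError)
```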
@@ -3,18 +3,21 @@ set -xe
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst"
+data="${project}/data"
-trainset="${project}/data/train/${modelname}-apps-train.jsonl"
-testset="${project}/data/test/${modelname}-apps-test.jsonl"
-train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
+trainset="${data}/train/${modelname}-apps-train.jsonl"
+testset="${data}/test/${modelname}-apps-test.jsonl"
+train_selected_pairs="${data}/train/${modelname}-apps-train-selected_pairs.jsonl"
apps="/lustre/S/nanziyuan/datasets/apps/"
-sft="${project}/data/train/${modelname}-sft.jsonl"
+sft="${data}/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"
+testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
+evalresults="${data}/eval/qwen25_code_inst-apps-test-algolr-score.jsonl"
-### export CUDA_VISIBLE_DEVICES=0,1,2,3
+# export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m codecritic.cli.algolr \

@@ -50,3 +53,11 @@ openrlhf.cli.train_sft \
--load_checkpoint \
--gradient_checkpointing \
--use_tensorboard "${ftmodel}_log"
+python -m codecritic.cli.test_genrm \
+--model ${ftmodel} \
+--testset ${testset} \
+--output ${evalresults} \
+--reasoning \
+--tp 1
@@ -12,7 +12,7 @@ train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pai
reward_ds="${project}/data/train/${modelname}-apps-train-reward_dataset.jsonl"
-export CUDA_VISIBLE_DEVICES=0,1,2,3
+# export CUDA_VISIBLE_DEVICES=0,1,2,3
## Sampling
## APPS
...
set -xe
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst"
trainset="${project}/data/train/${modelname}-apps-train.jsonl"
testset="${project}/data/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
apps="/lustre/S/nanziyuan/datasets/apps/"
sft="${project}/data/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"
testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-genrm-score.jsonl"
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# python -m codecritic.cli.algolr \
# --model ${model} \
# --dataset ${trainset} \
# --pairinfo ${train_selected_pairs} \
# --apps ${apps} \
# --output ${sft} \
# --level beginner \
# --tp 1
# deepspeed --module \
# openrlhf.cli.train_sft \
# --max_len 4096 \
# --dataset ${sft} \
# --input_key question \
# --output_key response \
# --apply_chat_template \
# --train_batch_size 256 \
# --micro_train_batch_size 2 \
# --max_samples 500000 \
# --pretrain ${model} \
# --save_path ${ftmodel} \
# --save_steps -1 \
# --logging_steps 1 \
# --eval_steps -1 \
# --zero_stage 2 \
# --max_epochs 1 \
# --bf16 \
# --flash_attn \
# --learning_rate 5e-6 \
# --load_checkpoint \
# --gradient_checkpointing \
# --use_tensorboard "${ftmodel}_log"
python -m codecritic.cli.test_genrm \
--model ${model} \
--testset ${testset} \
--output ${evalresults} \
--reasoning \
--tp 1
@@ -6,6 +6,9 @@ project="/lustre/S/nanziyuan/projects/ccc"
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
ftmodel="${project}/model/qwen25_coder_inst_7b-orm"
+testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
+evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-orm-score.jsonl"
deepspeed --module \
openrlhf.cli.train_rm \
--save_path ${ftmodel} \

@@ -31,41 +34,41 @@ openrlhf.cli.train_rm \
--use_tensorboard "${ftmodel}_log"
-# start_server() {
-# echo "Starting server..."
-# CUDA_VISIBLE_DEVICES=0 \
-# python -m openrlhf.cli.serve_rm \
-# --reward_pretrain ${model} \
-# --normalize_reward \
-# --port 5000 \
-# --bf16 \
-# --max_len 8192 &
-# SERVER_PID=$!
-# echo "Server started with PID: $SERVER_PID"
-# }
-# # Function to start the client
-# start_client() {
-# echo "Starting client..."
-# python -m codecritic.cli.run_rm_test \
-# --model ${model} \
-# --test "${datasets}/sample/min_test.jsonl" \
-# --apps /lustre/S/nanziyuan/datasets/apps/
-# CLIENT_EXIT_CODE=$?
-# echo "Client finished with exit code: $CLIENT_EXIT_CODE"
-# }
-# # Function to stop the server
-# stop_server() {
-# echo "Stopping server..."
-# kill -SIGINT $SERVER_PID
-# wait $SERVER_PID 2>/dev/null
-# echo "Server stopped."
-# }
-# start_server
-# # Give the server some time to initialize (optional)
-# sleep 60
-# start_client
-# stop_server
-# echo "Execution complete."
+start_server() {
+echo "Starting server..."
+CUDA_VISIBLE_DEVICES=0 \
+python -m openrlhf.cli.serve_rm \
+--reward_pretrain ${ftmodel} \
+--normalize_reward \
+--port 5000 \
+--bf16 \
+--max_len 8192 &
+SERVER_PID=$!
+echo "Server started with PID: $SERVER_PID"
+}
+# Function to start the client
+start_client() {
+echo "Starting client..."
+python -m codecritic.cli.test_orm \
+--model ${ftmodel} \
+--testset ${testset} \
+--output ${evalresults}
+CLIENT_EXIT_CODE=$?
+echo "Client finished with exit code: $CLIENT_EXIT_CODE"
+}
+# Function to stop the server
+stop_server() {
+echo "Stopping server..."
+kill -SIGINT $SERVER_PID
+wait $SERVER_PID 2>/dev/null
+echo "Server stopped."
+}
+start_server
+# Give the server some time to initialize (optional)
+sleep 60
+start_client
+stop_server
+echo "Execution complete."