Commit 36054b07 by nzy

Merge branch 'main' of http://62.234.201.16/nzy/codecritic

parents 030e1e12 2a43e44e
import argparse
from collections import defaultdict
from functools import partial
import pprint
import random
from vllm import SamplingParams
from datasets import load_dataset
......@@ -51,11 +53,14 @@ if __name__ == "__main__":
worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
hint_responses = model_map(worker, hint_prompts, args.tp)
pprint.pp(hint_responses[0])
hints = [promptlib.postprocess_to_hint(x) for x in hint_responses]
# hints: {"dataset"..., "task_id": ..., "solution_id": ..., "hints": ...}
# save_jsonl(hint_responses, args.output + ".hint_responses")
save_jsonl(hints, args.output + ".hints")
# hints = load_jsonl(args.output + ".hints")
hints_dict = defaultdict(dict)
for item in hints:
......@@ -73,7 +78,7 @@ if __name__ == "__main__":
reason_prompts.append(chosen_prompt)
# rejected
rejected_hints = hints_dict[task_id][rejected_id]
rejected_hints = hints_dict[task_id][rejected_id]["hint"]
rejected_prompt = promptlib.process_to_reason_prompt(rejected, rejected_hints)
reason_prompts.append(rejected_prompt)
......@@ -86,6 +91,8 @@ if __name__ == "__main__":
worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
reason_responses = model_map(worker, reason_prompts, args.tp)
pprint.pp(reason_responses[0])
save_jsonl(reason_responses, args.output + ".reason")
# Step3 Verify reasoning results
# add a prompt: "correct the code based on the reasoning"
......@@ -114,6 +121,7 @@ if __name__ == "__main__":
worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
verify_responses = model_map(worker, reason_responses, args.tp)
pprint.pp(verify_responses[0])
print("verify response size: {}".format(len(verify_responses)))
# postprocess verify_response.
......@@ -148,8 +156,7 @@ if __name__ == "__main__":
print("Corrects (execution consistent) size: {}".format(len(corrects)))
print("Incorrects (execution consistent) size: {}".format(len(incorrects)))
# Step4 Remove hints and Reformat to a SFT dataset
# extract reasoning sets
# Step4 Reformat to a SFT dataset
sft = []
for item in verify_passed:
......@@ -169,4 +176,26 @@ if __name__ == "__main__":
sft.append(line)
print("Size of sft dataset: {}".format(len(sft)))
pprint.pp(sft[0])
save_jsonl(sft, args.output)
# Step5 Keep one rationale per solution
task_solution_map = defaultdict(lambda: defaultdict(list))
for entry in sft:
task_id = entry["task_id"]
solution_id = entry["solution_id"]
task_solution_map[task_id][solution_id.split("_")[0]].append(entry)
# Keep only one reasoning (rationale) for each solution
processed_dataset = []
for task_id, solution_map in task_solution_map.items():
for solution, reasoning_list in solution_map.items():
if len(reasoning_list) > 1:
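# NOTE: random.choice(range(1, ...)) never picks index 0, so the first rationale is always skipped when alternatives exist.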
selected_index = random.choice(range(1, len(reasoning_list)))
processed_dataset.append(reasoning_list[selected_index])
else:
processed_dataset.append(reasoning_list[0])
save_jsonl(processed_dataset, args.output.split('.')[0] + "-filtered.jsonl")
import argparse
from collections import defaultdict
import json
from functools import partial
from pathlib import Path
import codecritic.evaluation.metric as metric
from codecritic.utils.json import load_jsonl, save_jsonl
def eval(samples_path):
model, testset = samples_path.stem.split('-')[:2]
def f(item):
item["model"] = model
item["testset"] = testset
samples = load_jsonl(samples_path)
from codecritic.utils.json import load_jsonl
def eval(scores):
ks = list(range(1, 17))
results = []
results.append(metric.pass_at_k(samples, ks))
results.append(metric.top_at_k(samples, ks, metric.postive_and_negative))
results.append(metric.top_at_k(samples, ks, metric.positive_only))
# results.extend(metric.pass_at_k(scores, ks))
# results.extend(metric.pass_at_k(scores, [50]))
# results.extend(metric.top_at_k(scores, ks, metric.positive_only))
if "negative_score" in scores[0]:
results.extend(metric.top_at_k(scores, ks, metric.postive_and_negative))
for i in range(4):
threshold = 0.5 + i * 0.1
score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
results.append(metric.top_at_k(samples, ks, score_func))
# for i in range(4):
# threshold = 0.5 + i * 0.1
# score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
# results.extend(metric.top_at_k(scores, ks, score_func))
return list(map(f, results))
return results
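# metric.pass_at_k is imported above but its body is not part of this diff. For reference,
# a minimal sketch of the standard unbiased pass@k estimator (Chen et al., "Evaluating Large
# Language Models Trained on Code"); this is the generic formula, not necessarily this
# repository's exact implementation:
import math

def pass_at_k_estimate(n: int, c: int, k: int) -> float:
    """pass@k for one task given n samples, c of which are correct."""
    if n - c < k:
        return 1.0  # every size-k subset contains at least one correct sample
    return 1.0 - math.prod(1.0 - k / i for i in range(n - c + 1, n + 1))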
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--sample_dir",
type=str,
default=None,
help="Path to the directory containing samples. If not provided, cached results will be used."
)
parser.add_argument("--out_dir", type=str, help="path/to/output_dir")
parser.add_argument(
"--score_func",
type=str,
default="all",
choices=["all", "posonly", "posneg", "posneg_filter"], # Add valid options
help="Select the scoring function to use. Default: 'all'."
)
parser.add_argument("--plot", type=str, help="path/to/plot")
parser.add_argument("--score", type=str, help="path/to/score")
args = parser.parse_args()
outdir = Path(args.out_dir)
if args.sample_dir:
for samples_path in Path(args.sample_dir).glob("*.jsonl"):
out_path = outdir / (samples_path.stem + "-eval.jsonl")
if not out_path.exists():
eval_results = eval(samples_path)
save_jsonl(eval_results, out_path)
for out_path in outdir.glob("*.jsonl"):
pass
\ No newline at end of file
scores = load_jsonl(args.score)
groups = defaultdict(list)
for item in scores:
groups[item["dataset"]].append(item)
for dataset, lst in groups.items():
results = eval(lst)
for r in results:
r["dataset"] = dataset
print(json.dumps(r))
import argparse
from collections import defaultdict
from functools import partial
import os
from transformers import AutoTokenizer
from vllm import SamplingParams
from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
import codecritic.dataset.algolr_prompt as algolr_prompt
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="path/to/model")
parser.add_argument("--sample", type=str, help="path/to/sample")
parser.add_argument("--testset", type=str, help="path/to/testset")
parser.add_argument("--output", type=str, help="path/to/score")
parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
parser.add_argument(
......@@ -31,11 +33,24 @@ if __name__ == "__main__":
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(args.model)
dataset = load_jsonl(args.sample)
dataset = load_jsonl(args.testset)
ds = defaultdict(list)
for item in dataset:
ds[item["task_id"]].append(item)
unsolvable = []
dataset = []
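# Tasks where every sampled solution fails are treated as unsolvable: they get zero
# positive/negative scores here, skip model scoring, and are appended back before saving.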
for task_id, items in ds.items():
if all([not x["pass"] for x in items]):
for item in items:
item["positive_score"] = 0
item["negative_score"] = 0
unsolvable.extend(items)
else:
dataset.extend(items)
if args.reasoning:
for item in dataset:
item["messages"].append(THINK_MESSAGE)
dataset = [algolr_prompt.process_to_reason_prompt(x, None) for x in dataset]
sampling_params = SamplingParams(
n=1,
......@@ -68,4 +83,4 @@ if __name__ == "__main__":
)
dataset = model_map(worker, dataset, args.tp)
save_jsonl(dataset, args.output)
save_jsonl(dataset + unsolvable, args.output)
import argparse
from collections import defaultdict
import json
import requests
from tqdm import tqdm
......@@ -21,18 +22,28 @@ def get_rewards_from_server(server_url: str, messages: list[str]):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="path/to/model")
parser.add_argument("--sample", type=str, help="path/to/sample")
parser.add_argument("--testset", type=str, help="path/to/testset")
parser.add_argument("--output", type=str, help="path/to/score")
args = parser.parse_args()
# compute score
dataset = load_jsonl(args.sample)
dataset = load_jsonl(args.testset)
ds = defaultdict(list)
for item in dataset:
ds[item["task_id"]].append(item)
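# Tasks where every sampled solution fails get positive_score = 0 up front; the reward-server
# loop below only scores items that do not yet have a "positive_score".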
for task_id, items in ds.items():
if all([not x["pass"] for x in items]):
for item in items:
item["positive_score"] = 0
server_url = "http://0.0.0.0:5000/get_reward"
tokenizer = AutoTokenizer.from_pretrained(args.model)
for item in tqdm(dataset):
query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
score = get_rewards_from_server(server_url, [query])[0]
item["score"] = score
if 'positive_score' not in item:
query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
score = get_rewards_from_server(server_url, [query])[0]
item["positive_score"] = score
save_jsonl(dataset, args.output)
import argparse
from functools import partial
import os
import pprint
from transformers import AutoTokenizer
from vllm import SamplingParams
from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.evaluation.metric import postive_and_negative, binary_metrics
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", type=str, help="path/to/model")
parser.add_argument("--trainset", type=str, help="path/to/testset")
parser.add_argument("--output", type=str, help="path/to/output")
parser.add_argument(
"--reason_max_tokens",
type=int,
default=4096,
help="maximum number of tokens allowed for the reasoning process.",
)
parser.add_argument(
"--tp", type=int, default=1, help="tensor parallel"
)
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained(args.model)
dataset = load_jsonl(args.trainset)[:1000]
for item in dataset:
item["messages"] = item["question"]
item["pass"] = (item["response"][-1]["content"] == "Yes")
sampling_params = SamplingParams(
n=1,
temperature=0,
top_p=0.95,
max_tokens=args.reason_max_tokens,
)
worker = partial(
generate_worker, model_path=args.model, sampling_params=sampling_params
)
dataset = model_map(worker, dataset, args.tp)
def get_token_id(token):
score_tokens = tokenizer.encode(token, add_special_tokens=False)
assert len(score_tokens) == 1
return score_tokens[0]
positive_token = get_token_id(JUDGE_TOEKNS["positive"])
negative_token = get_token_id(JUDGE_TOEKNS["negative"])
for item in dataset:
item["messages"].append(JUDGE_MESSAGE)
worker = partial(
score_worker,
model_path=args.model,
positive_token=positive_token,
negative_token=negative_token,
)
dataset = model_map(worker, dataset, args.tp)
scores = [postive_and_negative(item) for item in dataset]
labels = [item["pass"] for item in dataset]
pprint.pp(binary_metrics(labels, scores))
save_jsonl(dataset, args.output)
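# score_worker's internals are not shown in this diff. A plausible sketch (an assumption,
# not the verified implementation) of how the two judge-token log-probabilities could be
# turned into the positive/negative scores consumed by postive_and_negative():
import math

def judge_token_scores(logprob_positive: float, logprob_negative: float) -> dict:
    # Exponentiate the log-probabilities of the "Yes" / "No" judge tokens;
    # postive_and_negative() later normalizes them as pos / (pos + neg).
    return {
        "positive_score": math.exp(logprob_positive),
        "negative_score": math.exp(logprob_negative),
    }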
......@@ -152,10 +152,7 @@ def remove_hint(item):
def extract_conclusion_and_code(response):
# Extract conclusion
if 'Conclusion:' not in response:
conclusion = None
print("not found conclusion\n{}".format(response))
else:
try:
conclusion_line = [line for line in response.split('\n') if line.startswith('Conclusion:')][0]
conclusion_str = conclusion_line.split(': ')[1].strip().lower()
......@@ -166,6 +163,9 @@ def extract_conclusion_and_code(response):
else:
print("llm doesn't draw to a conclusion\n{}".format(response))
conclusion = None
except Exception as e:
print("not found conclusion\n{}\n{}".format(response, e))
conclusion = None
# Extract corrected code if conclusion is 'No'
corrected_code = ""
......
......@@ -84,7 +84,7 @@ def evaluate_code_samples(code_samples, apps):
cpu_num = multiprocessing.cpu_count() // 2
chunksize = max(len(code_samples) // (cpu_num * 10), 1)
results = process_map(
test_generation, args, max_workers=cpu_num, chunksize=chunksize
test_generation, args, max_workers=cpu_num, chunksize=1
)
return results
......@@ -100,7 +100,7 @@ def evaluate(code_samples, apps):
The 'loop_num' parameter controls how many times evaluation is repeated so that the test framework can check for a consistent result.
"""
all_results = []
for _ in range(2):
for _ in range(1):
results = evaluate_code_samples(code_samples, apps)
all_results.append(results)
......
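# The reconciliation of the repeated evaluation runs is elided above. A generic sketch of
# the idea (an assumption, not this repository's actual logic): keep a per-sample verdict
# only when every run agrees, otherwise mark the sample as failed.
def reconcile_runs(all_results):
    merged = []
    for per_sample in zip(*all_results):  # verdicts for one code sample across runs
        merged.append(per_sample[0] if all(r == per_sample[0] for r in per_sample) else False)
    return merged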
# copy from codeparrot/apps_metric/testing_util.py
# https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/testing_util.py
# Log: Replace pyext with importlib
import json
import sys
......@@ -66,7 +66,7 @@ def run_test(sample, test=None, debug=False):
"""
# Disable functionalities that can make destructive changes to the test.
reliability_guard()
if debug:
print(f"start = {datetime.now().time()}")
......@@ -84,7 +84,7 @@ def run_test(sample, test=None, debug=False):
if debug:
print(f"loaded input_output = {datetime.now().time()}")
if test is None:
return in_outs
elif test is not None:
......@@ -92,7 +92,7 @@ def run_test(sample, test=None, debug=False):
sol = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
if debug:
print(f"loading test code = {datetime.now().time()}")
if which_type == CODE_TYPE.call_based:
sol += test
if debug:
......@@ -124,7 +124,7 @@ def run_test(sample, test=None, debug=False):
else:
new_test.append(x + "\n")
tmp_test = new_test
new_test = ""
started = False
for i in tmp_test:
......@@ -133,7 +133,7 @@ def run_test(sample, test=None, debug=False):
new_test += "def code():\n"
new_test += i
started = True
elif started and ((i.startswith("from ")) or (i.startswith("import "))):
new_test += "\t" + i
else:
new_test += i
......@@ -157,7 +157,7 @@ def run_test(sample, test=None, debug=False):
signal.alarm(0)
if debug:
print(f"get method = {datetime.now().time()}")
try:
method = getattr(tmp, method_name) # get_attr second arg must be str
except:
......@@ -196,7 +196,7 @@ def run_test(sample, test=None, debug=False):
# ground truth sequences are not tuples
if isinstance(output, tuple):
output = list(output)
tmp_result = output == in_outs["outputs"][index]
if isinstance(in_outs["outputs"][index], list) and in_outs["outputs"][index]:
tmp_result = tmp_result or (output == in_outs["outputs"][index][0])
......@@ -278,7 +278,7 @@ def run_test(sample, test=None, debug=False):
print(f"Failed check1 exception = {e}")
pass
if tmp_result == True:
results.append(tmp_result)
continue
......@@ -312,10 +312,10 @@ def run_test(sample, test=None, debug=False):
if debug:
nl = "\n"
if not isinstance(inputs, list):
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
else:
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
if tmp_result == True:
results.append(tmp_result)
continue
......@@ -350,9 +350,9 @@ def run_test(sample, test=None, debug=False):
# try by converting the stuff into split up list
if isinstance(in_outs["outputs"][index], list):
for tmp_index, i in enumerate(in_outs["outputs"][index]):
in_outs["outputs"][index][tmp_index] = set(i.split())
in_outs["outputs"][index][tmp_index] = list(i.split())
else:
in_outs["outputs"][index] = set(in_outs["outputs"][index].split())
in_outs["outputs"][index] = list(in_outs["outputs"][index].split())
try:
tmp_result = (output == in_outs["outputs"][index])
......@@ -363,7 +363,7 @@ def run_test(sample, test=None, debug=False):
if tmp_result == True:
results.append(tmp_result)
continue
# try by converting the output into a split up list too
if isinstance(output, list):
......@@ -371,14 +371,14 @@ def run_test(sample, test=None, debug=False):
output[tmp_index] = i.split()
output = list(filter(len, output))
for tmp_index, i in enumerate(output):
output[tmp_index] = set(i)
output[tmp_index] = list(i)
else:
output = output.split()
output = list(filter(len, output))
output = set(output)
output = list(output)
try:
tmp_result = (set(frozenset(s) for s in output) == set(frozenset(s) for s in in_outs["outputs"][index]))
tmp_result = (list(list(s) for s in output) == list(list(s) for s in in_outs["outputs"][index]))
except Exception as e:
if debug:
print(f"Failed check5 exception = {e}")
......@@ -386,30 +386,30 @@ def run_test(sample, test=None, debug=False):
# if they are all numbers, round so that similar numbers are treated as identical
try:
tmp_result = tmp_result or (set(frozenset(round(float(t),3) for t in s) for s in output) ==\
set(frozenset(round(float(t),3) for t in s) for s in in_outs["outputs"][index]))
tmp_result = tmp_result or (list(list(round(float(t),3) for t in s) for s in output) ==\
list(list(round(float(t),3) for t in s) for s in in_outs["outputs"][index]))
except Exception as e:
if debug:
print(f"Failed check6 exception = {e}")
if tmp_result == True and debug:
print("PASSED")
results.append(tmp_result)
if debug:
nl = "\n"
if not isinstance(inputs, list):
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
else:
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
return results
def custom_compare_(output, ground_truth):
if isinstance(output, list):
output_1 = "\n".join(output)
if stripped_string_compare(output_1, ground_truth):
......@@ -451,7 +451,7 @@ def call_method(method, inputs):
pass
finally:
pass
return _inner_call_method(method)
......
......@@ -50,7 +50,10 @@ def positive_only(item):
def postive_and_negative(item):
pos = item["positive_score"]
neg = item["negative_score"]
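# Both scores can be zero (e.g. tasks marked unsolvable upstream are assigned
# positive_score = negative_score = 0), so guard against division by zero below.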
return pos / (pos + neg)
if (pos + neg) == 0:
return 0
else:
return pos / (pos + neg)
def pos_neg_filter_uncertain(item, threshold):
......
......@@ -3,18 +3,21 @@ set -xe
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst"
data="${project}/data"
trainset="${project}/data/train/${modelname}-apps-train.jsonl"
testset="${project}/data/test/${modelname}-apps-test.jsonl"
trainset="${data}/train/${modelname}-apps-train.jsonl"
testset="${data}/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
train_selected_pairs="${data}/train/${modelname}-apps-train-selected_pairs.jsonl"
apps="/lustre/S/nanziyuan/datasets/apps/"
sft="${project}/data/train/${modelname}-sft.jsonl"
sft="${data}/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"
testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="${data}/eval/qwen25_code_inst-apps-test-algolr-score.jsonl"
### export CUDA_VISIBLE_DEVICES=0,1,2,3
# export CUDA_VISIBLE_DEVICES=0,1,2,3
python -m codecritic.cli.algolr \
......@@ -50,3 +53,11 @@ openrlhf.cli.train_sft \
--load_checkpoint \
--gradient_checkpointing \
--use_tensorboard "${ftmodel}_log"
python -m codecritic.cli.test_genrm \
--model ${ftmodel} \
--testset ${testset} \
--output ${evalresults} \
--reasoning \
--tp 1
......@@ -12,7 +12,7 @@ train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pai
reward_ds="${project}/data/train/${modelname}-apps-train-reward_dataset.jsonl"
export CUDA_VISIBLE_DEVICES=0,1,2,3
# export CUDA_VISIBLE_DEVICES=0,1,2,3
## Sampling
## APPS
......
set -xe
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst"
trainset="${project}/data/train/${modelname}-apps-train.jsonl"
testset="${project}/data/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
apps="/lustre/S/nanziyuan/datasets/apps/"
sft="${project}/data/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"
testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-genrm-score.jsonl"
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
# python -m codecritic.cli.algolr \
# --model ${model} \
# --dataset ${trainset} \
# --pairinfo ${train_selected_pairs} \
# --apps ${apps} \
# --output ${sft} \
# --level beginner \
# --tp 1
# deepspeed --module \
# openrlhf.cli.train_sft \
# --max_len 4096 \
# --dataset ${sft} \
# --input_key question \
# --output_key response \
# --apply_chat_template \
# --train_batch_size 256 \
# --micro_train_batch_size 2 \
# --max_samples 500000 \
# --pretrain ${model} \
# --save_path ${ftmodel} \
# --save_steps -1 \
# --logging_steps 1 \
# --eval_steps -1 \
# --zero_stage 2 \
# --max_epochs 1 \
# --bf16 \
# --flash_attn \
# --learning_rate 5e-6 \
# --load_checkpoint \
# --gradient_checkpointing \
# --use_tensorboard "${ftmodel}_log"
python -m codecritic.cli.test_genrm \
--model ${model} \
--testset ${testset} \
--output ${evalresults} \
--reasoning \
--tp 1
......@@ -6,6 +6,9 @@ project="/lustre/S/nanziyuan/projects/ccc"
model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
ftmodel="${project}/model/qwen25_coder_inst_7b-orm"
testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-orm-score.jsonl"
deepspeed --module \
openrlhf.cli.train_rm \
--save_path ${ftmodel} \
......@@ -31,41 +34,41 @@ openrlhf.cli.train_rm \
--use_tensorboard "${ftmodel}_log"
# start_server() {
# echo "Starting server..."
# CUDA_VISIBLE_DEVICES=0 \
# python -m openrlhf.cli.serve_rm \
# --reward_pretrain ${model} \
# --normalize_reward \
# --port 5000 \
# --bf16 \
# --max_len 8192 &
# SERVER_PID=$!
# echo "Server started with PID: $SERVER_PID"
# }
start_server() {
echo "Starting server..."
CUDA_VISIBLE_DEVICES=0 \
python -m openrlhf.cli.serve_rm \
--reward_pretrain ${ftmodel} \
--normalize_reward \
--port 5000 \
--bf16 \
--max_len 8192 &
SERVER_PID=$!
echo "Server started with PID: $SERVER_PID"
}
# # Function to start the client
# start_client() {
# echo "Starting client..."
# python -m codecritic.cli.run_rm_test \
# --model ${model} \
# --test "${datasets}/sample/min_test.jsonl" \
# --apps /lustre/S/nanziyuan/datasets/apps/
# CLIENT_EXIT_CODE=$?
# echo "Client finished with exit code: $CLIENT_EXIT_CODE"
# }
# Function to start the client
start_client() {
echo "Starting client..."
python -m codecritic.cli.test_orm \
--model ${ftmodel} \
--testset ${testset} \
--output ${evalresults}
CLIENT_EXIT_CODE=$?
echo "Client finished with exit code: $CLIENT_EXIT_CODE"
}
# # Function to stop the server
# stop_server() {
# echo "Stopping server..."
# kill -SIGINT $SERVER_PID
# wait $SERVER_PID 2>/dev/null
# echo "Server stopped."
# }
# Function to stop the server
stop_server() {
echo "Stopping server..."
kill -SIGINT $SERVER_PID
wait $SERVER_PID 2>/dev/null
echo "Server stopped."
}
# start_server
# # Give the server some time to initialize (optional)
# sleep 60
# start_client
# stop_server
# echo "Execution complete."
start_server
# Give the server some time to initialize (optional)
sleep 60
start_client
stop_server
echo "Execution complete."