Commit 36054b07 by nzy

Merge branch 'main' of http://62.234.201.16/nzy/codecritic

parents 030e1e12 2a43e44e
 import argparse
 from collections import defaultdict
 from functools import partial
+import pprint
+import random

 from vllm import SamplingParams
 from datasets import load_dataset
@@ -51,11 +53,14 @@ if __name__ == "__main__":
     worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
     hint_responses = model_map(worker, hint_prompts, args.tp)
+    pprint.pp(hint_responses[0])
     hints = [promptlib.postprocess_to_hint(x) for x in hint_responses]
     # hints: {"dataset"..., "task_id": ..., "solution_id": ..., "hints": ...}
+    # save_jsonl(hint_responses, args.output + ".hint_responses")
     save_jsonl(hints, args.output + ".hints")
+    # hints = load_jsonl(args.output + ".hints")
     hints_dict = defaultdict(dict)
     for item in hints:
@@ -73,7 +78,7 @@ if __name__ == "__main__":
         reason_prompts.append(chosen_prompt)
         # rejected
-        rejected_hints = hints_dict[task_id][rejected_id]
+        rejected_hints = hints_dict[task_id][rejected_id]["hint"]
         rejected_prompt = promptlib.process_to_reason_prompt(rejected, rejected_hints)
         reason_prompts.append(rejected_prompt)
@@ -86,6 +91,8 @@ if __name__ == "__main__":
     worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
     reason_responses = model_map(worker, reason_prompts, args.tp)
+    pprint.pp(reason_responses[0])
+    save_jsonl(reason_responses, args.output + ".reason")
     # Step3 Verify reasoning results
     # add prompt "correct the code based the reasoning"
@@ -114,6 +121,7 @@ if __name__ == "__main__":
     worker = partial(generate_worker, model_path=args.model, sampling_params=sampling_params)
     verify_responses = model_map(worker, reason_responses, args.tp)
+    pprint.pp(verify_responses[0])
     print("verify response size: {}".format(len(verify_responses)))
     # postprocess verify_response.
@@ -148,8 +156,7 @@ if __name__ == "__main__":
     print("Corrects (execution consistent) size: {}".format(len(corrects)))
     print("Incorrects (execution consistent) size: {}".format(len(incorrects)))
-    # Step4 Remove hints and Reformat to a SFT dataset
-    # extract reasoning sets
+    # Step4 Reformat to a SFT dataset
     sft = []
     for item in verify_passed:
@@ -169,4 +176,26 @@ if __name__ == "__main__":
             sft.append(line)
     print("Size of sft dataset: {}".format(len(sft)))
+    pprint.pp(sft[0])
     save_jsonl(sft, args.output)
+
+    # Step5 keep 1 rationale for 1 solution
+    task_solution_map = defaultdict(lambda: defaultdict(list))
+    for entry in sft:
+        task_id = entry["task_id"]
+        solution_id = entry["solution_id"]
+        task_solution_map[task_id][solution_id.split("_")[0]].append(entry)
+
+    # Step 2: Keep only one reasoning for each solution
+    processed_dataset = []
+    for task_id, solution_map in task_solution_map.items():
+        for solution, reasoning_list in solution_map.items():
+            if len(reasoning_list) > 1:
+                selected_index = random.choice(range(1, len(reasoning_list)))
+                processed_dataset.append(reasoning_list[selected_index])
+            else:
+                processed_dataset.append(reasoning_list[0])
+
+    save_jsonl(processed_dataset, args.output.split('.')[0] + "-filtered.jsonl")
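
The new Step 5 block above groups SFT records by task and by the base solution id (the part of solution_id before the first underscore), then keeps a single rationale per solution. A minimal, self-contained sketch of that grouping on toy records, here simply picking one entry uniformly at random (illustrative only, not part of the commit):

# Illustrative sketch (not part of the commit): keep one rationale per solution.
import random
from collections import defaultdict

sft = [
    {"task_id": "t1", "solution_id": "3_0", "rationale": "r0"},
    {"task_id": "t1", "solution_id": "3_1", "rationale": "r1"},
    {"task_id": "t1", "solution_id": "7_0", "rationale": "r2"},
]

task_solution_map = defaultdict(lambda: defaultdict(list))
for entry in sft:
    base_solution = entry["solution_id"].split("_")[0]  # "3_1" -> "3"
    task_solution_map[entry["task_id"]][base_solution].append(entry)

kept = [random.choice(entries)
        for solutions in task_solution_map.values()
        for entries in solutions.values()]
print(len(kept))  # 2: one rationale for solution "3", one for "7"
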
 import argparse
+from collections import defaultdict
+import json
 from functools import partial
-from pathlib import Path

 import codecritic.evaluation.metric as metric
-from codecritic.utils.json import load_jsonl, save_jsonl
+from codecritic.utils.json import load_jsonl

-def eval(samples_path):
-    model, testset = samples_path.stem.split('-')[:2]
-    def f(item):
-        item["model"] = model
-        item["testset"] = testset
-    samples = load_jsonl(samples_path)
+def eval(scores):
     ks = list(range(1, 17))
     results = []
-    results.append(metric.pass_at_k(samples, ks))
-    results.append(metric.top_at_k(samples, ks, metric.postive_and_negative))
-    results.append(metric.top_at_k(samples, ks, metric.positive_only))
-    for i in range(4):
-        threshold = 0.5 + i * 0.1
-        score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
-        results.append(metric.top_at_k(samples, ks, score_func))
-    return list(map(f, results))
+    # results.extend(metric.pass_at_k(scores, ks))
+    # results.extend(metric.pass_at_k(scores, [50]))
+    # results.extend(metric.top_at_k(scores, ks, metric.positive_only))
+    if "negative_score" in scores[0]:
+        results.extend(metric.top_at_k(scores, ks, metric.postive_and_negative))
+    # for i in range(4):
+    #     threshold = 0.5 + i * 0.1
+    #     score_func = partial(metric.pos_neg_filter_uncertain, threshold=threshold)
+    #     results.extend(metric.top_at_k(scores, ks, score_func))
+    return results

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--sample_dir",
-        type=str,
-        default=None,
-        help="Path to the directory containing samples. If not provided, cached results will be used."
-    )
-    parser.add_argument("--out_dir", type=str, help="path/to/output_dir")
-    parser.add_argument(
-        "--score_func",
-        type=str,
-        default="all",
-        choices=["all", "posonly", "posneg", "posneg_filter"],  # Add valid options
-        help="Select the scoring function to use. Default: 'all'."
-    )
-    parser.add_argument("--plot", type=str, help="path/to/plot")
+    parser.add_argument("--score", type=str, help="path/to/score")
     args = parser.parse_args()

-    outdir = Path(args.out_dir)
-    if args.sample_dir:
-        for samples_path in Path(args.sample_dir).glob("*.jsonl"):
-            out_path = outdir / (samples_path.stem + "-eval.jsonl")
-            if not out_path.exists():
-                eval_results = eval(samples_path)
-                save_jsonl(eval_results, out_path)
-    for out_path in outdir.glob("*.jsonl"):
-        pass
+    scores = load_jsonl(args.score)
+    groups = defaultdict(list)
+    for item in scores:
+        groups[item["dataset"]].append(item)
+
+    for dataset, lst in groups.items():
+        results = eval(lst)
+        for r in results:
+            r["dataset"] = dataset
+            print(json.dumps(r))
\ No newline at end of file
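
The metrics themselves live in codecritic.evaluation.metric, which this diff does not show. For orientation, a sketch of the standard unbiased pass@k estimator (Chen et al., 2021) that a function like metric.pass_at_k conventionally implements; this is an assumption about the repository, not its code:

# Not from this repo: the standard unbiased pass@k estimator, shown only to
# clarify what a metric like metric.pass_at_k typically computes.
from math import comb

def pass_at_k(n: int, c: int, k: int) -> float:
    """n = samples per task, c = correct samples, k = sampling budget."""
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# Example: 16 samples, 4 correct -> pass@1 = 0.25, pass@8 ~= 0.96
print(pass_at_k(16, 4, 1), pass_at_k(16, 4, 8))
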
 import argparse
+from collections import defaultdict
 from functools import partial
 import os

 from transformers import AutoTokenizer
 from vllm import SamplingParams

-from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS
+from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
 from codecritic.utils.inference import generate_worker, score_worker
 from codecritic.utils.parallel import model_map
 from codecritic.utils.json import load_jsonl, save_jsonl
+import codecritic.dataset.algolr_prompt as algolr_prompt

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", type=str, help="path/to/model")
-    parser.add_argument("--sample", type=str, help="path/to/sample")
+    parser.add_argument("--testset", type=str, help="path/to/testset")
     parser.add_argument("--output", type=str, help="path/to/score")
     parser.add_argument("--reasoning", action="store_true", help="enable reasoning")
     parser.add_argument(
@@ -31,11 +33,24 @@ if __name__ == "__main__":
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
     tokenizer = AutoTokenizer.from_pretrained(args.model)
-    dataset = load_jsonl(args.sample)
+    dataset = load_jsonl(args.testset)
+
+    ds = defaultdict(list)
+    for item in dataset:
+        ds[item["task_id"]].append(item)
+
+    unsolvable = []
+    dataset = []
+    for task_id, items in ds.items():
+        if all([not x["pass"] for x in items]):
+            for item in items:
+                item["positive_score"] = 0
+                item["negative_score"] = 0
+            unsolvable.extend(items)
+        else:
+            dataset.extend(items)
     if args.reasoning:
-        for item in dataset:
-            item["messages"].append(THINK_MESSAGE)
+        dataset = [algolr_prompt.process_to_reason_prompt(x, None) for x in dataset]
     sampling_params = SamplingParams(
         n=1,
@@ -68,4 +83,4 @@ if __name__ == "__main__":
     )
     dataset = model_map(worker, dataset, args.tp)
-    save_jsonl(dataset, args.output)
+    save_jsonl(dataset + unsolvable, args.output)
 import argparse
+from collections import defaultdict
 import json

 import requests
 from tqdm import tqdm
@@ -21,18 +22,28 @@ def get_rewards_from_server(server_url: str, messages: list[str]):

 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("--model", type=str, help="path/to/model")
-    parser.add_argument("--sample", type=str, help="path/to/sample")
+    parser.add_argument("--testset", type=str, help="path/to/testset")
     parser.add_argument("--output", type=str, help="path/to/score")
     args = parser.parse_args()
     # compute score
-    dataset = load_jsonl(args.sample)
+    dataset = load_jsonl(args.testset)
+
+    ds = defaultdict(list)
+    for item in dataset:
+        ds[item["task_id"]].append(item)
+
+    for task_id, items in ds.items():
+        if all([not x["pass"] for x in items]):
+            for item in items:
+                item["positive_score"] = 0
     server_url = "http://0.0.0.0:5000/get_reward"
     tokenizer = AutoTokenizer.from_pretrained(args.model)
     for item in tqdm(dataset):
-        query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
-        score = get_rewards_from_server(server_url, [query])[0]
-        item["score"] = score
+        if 'positive_score' not in item:
+            query = tokenizer.apply_chat_template(item["messages"], tokenize=False)
+            score = get_rewards_from_server(server_url, [query])[0]
+            item["positive_score"] = score
     save_jsonl(dataset, args.output)
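
The body of get_rewards_from_server is outside this hunk. A hedged sketch of what such a client plausibly looks like, assuming the OpenRLHF serve_rm interface (POST a JSON body with a "query" list, read a "rewards" list from the response); the payload and response keys are assumptions, not taken from the repository:

# Hedged sketch (body not shown in this hunk): a minimal reward-server client,
# assuming a POST {"query": [...]} -> {"rewards": [...]} contract.
import requests

def get_rewards_from_server(server_url: str, messages: list[str]) -> list[float]:
    response = requests.post(server_url, json={"query": messages}, timeout=60)
    response.raise_for_status()
    return response.json()["rewards"]
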
import argparse
from functools import partial
import os
import pprint

from transformers import AutoTokenizer
from vllm import SamplingParams

from codecritic.dataset.genrm_prompt import JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.evaluation.metric import postive_and_negative, binary_metrics

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str, help="path/to/model")
    parser.add_argument("--trainset", type=str, help="path/to/testset")
    parser.add_argument("--output", type=str, help="path/to/output")
    parser.add_argument(
        "--reason_max_tokens",
        type=int,
        default=4096,
        help="maximum number of tokens allowed for the reasoning process.",
    )
    parser.add_argument(
        "--tp", type=int, default=1, help="tensor parallel"
    )
    args = parser.parse_args()

    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    tokenizer = AutoTokenizer.from_pretrained(args.model)

    dataset = load_jsonl(args.trainset)[:1000]
    for item in dataset:
        item["messages"] = item["question"]
        item["pass"] = (item["response"][-1]["content"] == "Yes")

    sampling_params = SamplingParams(
        n=1,
        temperature=0,
        top_p=0.95,
        max_tokens=args.reason_max_tokens,
    )
    worker = partial(
        generate_worker, model_path=args.model, sampling_params=sampling_params
    )
    dataset = model_map(worker, dataset, args.tp)

    def get_token_id(token):
        score_tokens = tokenizer.encode(token, add_special_tokens=False)
        assert len(score_tokens) == 1
        return score_tokens[0]

    positive_token = get_token_id(JUDGE_TOEKNS["positive"])
    negative_token = get_token_id(JUDGE_TOEKNS["negative"])

    for item in dataset:
        item["messages"].append(JUDGE_MESSAGE)

    worker = partial(
        score_worker,
        model_path=args.model,
        positive_token=positive_token,
        negative_token=negative_token,
    )
    dataset = model_map(worker, dataset, args.tp)

    scores = [postive_and_negative(item) for item in dataset]
    labels = [item["pass"] for item in dataset]
    pprint.pp(binary_metrics(labels, scores))

    save_jsonl(dataset, args.output)
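
binary_metrics is imported from codecritic.evaluation.metric and is not shown in this diff. A hypothetical sketch of the kind of summary such a helper typically returns, assuming boolean labels and scores in [0, 1]; names and fields here are illustrative, not the repository's:

# Hypothetical sketch of a binary_metrics-style helper (the repo's version is not shown).
def binary_metrics(labels: list[bool], scores: list[float], threshold: float = 0.5) -> dict:
    preds = [s >= threshold for s in scores]
    tp = sum(p and l for p, l in zip(preds, labels))
    fp = sum(p and not l for p, l in zip(preds, labels))
    fn = sum(not p and l for p, l in zip(preds, labels))
    tn = sum(not p and not l for p, l in zip(preds, labels))
    return {
        "accuracy": (tp + tn) / len(labels),
        "precision": tp / (tp + fp) if tp + fp else 0.0,
        "recall": tp / (tp + fn) if tp + fn else 0.0,
    }
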
@@ -152,10 +152,7 @@ def remove_hint(item):

 def extract_conclusion_and_code(response):
     # Extract conclusion
-    if 'Conclusion:' not in response:
-        conclusion = None
-        print("not found conclusion\n{}".format(response))
-    else:
+    try:
         conclusion_line = [line for line in response.split('\n') if line.startswith('Conclusion:')][0]
         conclusion_str = conclusion_line.split(': ')[1].strip().lower()
@@ -166,6 +163,9 @@ def extract_conclusion_and_code(response):
         else:
             print("llm doesn't draw to a conclusion\n{}".format(response))
             conclusion = None
+    except Exception as e:
+        print("not found conclusion\n{}\n{}".format(response, e))
+        conclusion = None

     # Extract corrected code if conclusion is 'No'
     corrected_code = ""
...
@@ -84,7 +84,7 @@ def evaluate_code_samples(code_samples, apps):
     cpu_num = multiprocessing.cpu_count() // 2
     chunksize = max(len(code_samples) // (cpu_num * 10), 1)
     results = process_map(
-        test_generation, args, max_workers=cpu_num, chunksize=chunksize
+        test_generation, args, max_workers=cpu_num, chunksize=1
     )
     return results
@@ -100,7 +100,7 @@ def evaluate(code_samples, apps):
    The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
    """
    all_results = []
-   for _ in range(2):
+   for _ in range(1):
        results = evaluate_code_samples(code_samples, apps)
        all_results.append(results)
...
+# copy from codeparrot/apps_metric/testing_util.py
+# https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/testing_util.py
+# Log: Replace pyext with importlib
 import json
 import sys
@@ -66,7 +66,7 @@ def run_test(sample, test=None, debug=False):
     """
     # Disable functionalities that can make destructive changes to the test.
     reliability_guard()
     if debug:
         print(f"start = {datetime.now().time()}")
@@ -84,7 +84,7 @@ def run_test(sample, test=None, debug=False):
     if debug:
         print(f"loaded input_output = {datetime.now().time()}")
     if test is None:
         return in_outs
     elif test is not None:
@@ -92,7 +92,7 @@ def run_test(sample, test=None, debug=False):
         sol = "import sys\nimport time\nimport itertools\nfrom itertools import accumulate, product, permutations, combinations\nimport collections\nfrom collections import Counter, OrderedDict, deque, defaultdict, ChainMap\nfrom functools import lru_cache\nimport math\nfrom math import sqrt, sin, cos, tan, ceil, fabs, floor, gcd, exp, log, log2\nimport fractions\nfrom typing import List, Tuple\nimport numpy as np\nimport random\nimport heapq\nfrom heapq import *\n"
         if debug:
             print(f"loading test code = {datetime.now().time()}")
         if which_type == CODE_TYPE.call_based:
             sol += test
             if debug:
@@ -124,7 +124,7 @@ def run_test(sample, test=None, debug=False):
                 else:
                     new_test.append(x + "\n")
             tmp_test = new_test
             new_test = ""
             started = False
             for i in tmp_test:
@@ -133,7 +133,7 @@ def run_test(sample, test=None, debug=False):
                     new_test += "def code():\n"
                     new_test += i
                     started = True
                 elif started and ((i.startswith("from ")) or (i.startswith("import "))):
                     new_test += "\t" + i
                 else:
                     new_test += i
@@ -157,7 +157,7 @@ def run_test(sample, test=None, debug=False):
         signal.alarm(0)
         if debug:
             print(f"get method = {datetime.now().time()}")
         try:
             method = getattr(tmp, method_name)  # get_attr second arg must be str
         except:
@@ -196,7 +196,7 @@ def run_test(sample, test=None, debug=False):
                 # ground truth sequences are not tuples
                 if isinstance(output, tuple):
                     output = list(output)
                 tmp_result = output == in_outs["outputs"][index]
                 if isinstance(in_outs["outputs"][index], list) and in_outs["outputs"][index]:
                     tmp_result = tmp_result or (output == in_outs["outputs"][index][0])
@@ -278,7 +278,7 @@ def run_test(sample, test=None, debug=False):
                     print(f"Failed check1 exception = {e}")
                     pass
                 if tmp_result == True:
                     results.append(tmp_result)
                     continue
@@ -312,10 +312,10 @@ def run_test(sample, test=None, debug=False):
                 if debug:
                     nl = "\n"
                     if not isinstance(inputs, list):
                         print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
                     else:
                         print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
                 if tmp_result == True:
                     results.append(tmp_result)
                     continue
@@ -350,9 +350,9 @@ def run_test(sample, test=None, debug=False):
                 # try by converting the stuff into split up list
                 if isinstance(in_outs["outputs"][index], list):
                     for tmp_index, i in enumerate(in_outs["outputs"][index]):
-                        in_outs["outputs"][index][tmp_index] = set(i.split())
+                        in_outs["outputs"][index][tmp_index] = list(i.split())
                 else:
-                    in_outs["outputs"][index] = set(in_outs["outputs"][index].split())
+                    in_outs["outputs"][index] = list(in_outs["outputs"][index].split())
                 try:
                     tmp_result = (output == in_outs["outputs"][index])
@@ -363,7 +363,7 @@ def run_test(sample, test=None, debug=False):
                 if tmp_result == True:
                     results.append(tmp_result)
                     continue
                 # try by converting the output into a split up list too
                 if isinstance(output, list):
@@ -371,14 +371,14 @@ def run_test(sample, test=None, debug=False):
                         output[tmp_index] = i.split()
                     output = list(filter(len, output))
                     for tmp_index, i in enumerate(output):
-                        output[tmp_index] = set(i)
+                        output[tmp_index] = list(i)
                 else:
                     output = output.split()
                     output = list(filter(len, output))
-                    output = set(output)
+                    output = list(output)
                 try:
-                    tmp_result = (set(frozenset(s) for s in output) == set(frozenset(s) for s in in_outs["outputs"][index]))
+                    tmp_result = (list(list(s) for s in output) == list(list(s) for s in in_outs["outputs"][index]))
                 except Exception as e:
                     if debug:
                         print(f"Failed check5 exception = {e}")
@@ -386,30 +386,30 @@ def run_test(sample, test=None, debug=False):
                 # if they are all numbers, round so that similar numbers are treated as identical
                 try:
-                    tmp_result = tmp_result or (set(frozenset(round(float(t),3) for t in s) for s in output) ==\
-                        set(frozenset(round(float(t),3) for t in s) for s in in_outs["outputs"][index]))
+                    tmp_result = tmp_result or (list(list(round(float(t),3) for t in s) for s in output) ==\
+                        list(list(round(float(t),3) for t in s) for s in in_outs["outputs"][index]))
                 except Exception as e:
                     if debug:
                         print(f"Failed check6 exception = {e}")
                 if tmp_result == True and debug:
                     print("PASSED")
                 results.append(tmp_result)
                 if debug:
                     nl = "\n"
                     if not isinstance(inputs, list):
                         print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs.replace(nl,' new-line ')}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
                     else:
                         print(f"output = {output}, test outputs = {in_outs['outputs'][index]}, inputs = {inputs}, {type(inputs)}, {output == [in_outs['outputs'][index]]}")
     return results

 def custom_compare_(output, ground_truth):
     if isinstance(output, list):
         output_1 = "\n".join(output)
         if stripped_string_compare(output_1, ground_truth):
@@ -451,7 +451,7 @@ def call_method(method, inputs):
             pass
         finally:
             pass
     return _inner_call_method(method)
...
@@ -50,7 +50,10 @@ def positive_only(item):

 def postive_and_negative(item):
     pos = item["positive_score"]
     neg = item["negative_score"]
-    return pos / (pos + neg)
+    if (pos + neg) == 0:
+        return 0
+    else:
+        return pos / (pos + neg)

 def pos_neg_filter_uncertain(item, threshold):
...
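
This guard matters because the updated test scripts above set positive_score = negative_score = 0 for samples of tasks where no candidate passes, and the old ratio would divide by zero on those items. A toy illustration mirroring the new behaviour (not repo code):

# Toy illustration of the guarded ratio; mirrors the new postive_and_negative behaviour.
def positive_and_negative(item):
    pos, neg = item["positive_score"], item["negative_score"]
    return 0 if (pos + neg) == 0 else pos / (pos + neg)

print(positive_and_negative({"positive_score": 0.8, "negative_score": 0.2}))  # 0.8
print(positive_and_negative({"positive_score": 0, "negative_score": 0}))      # 0 (unsolvable task)
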
@@ -3,18 +3,21 @@ set -xe
 model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
 project="/lustre/S/nanziyuan/projects/ccc"
 modelname="qwen25_coder_inst"
+data="${project}/data"

-trainset="${project}/data/train/${modelname}-apps-train.jsonl"
-testset="${project}/data/test/${modelname}-apps-test.jsonl"
-train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
+trainset="${data}/train/${modelname}-apps-train.jsonl"
+testset="${data}/test/${modelname}-apps-test.jsonl"
+train_selected_pairs="${data}/train/${modelname}-apps-train-selected_pairs.jsonl"

 apps="/lustre/S/nanziyuan/datasets/apps/"

-sft="${project}/data/train/${modelname}-sft.jsonl"
+sft="${data}/train/${modelname}-sft.jsonl"
 ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"

+testset="${data}/test/qwen25_coder_inst-apps-test.jsonl"
+evalresults="${data}/eval/qwen25_code_inst-apps-test-algolr-score.jsonl"

-### export CUDA_VISIBLE_DEVICES=0,1,2,3
+# export CUDA_VISIBLE_DEVICES=0,1,2,3

 python -m codecritic.cli.algolr \
@@ -50,3 +53,11 @@ openrlhf.cli.train_sft \
     --load_checkpoint \
     --gradient_checkpointing \
     --use_tensorboard "${ftmodel}_log"
+
+python -m codecritic.cli.test_genrm \
+    --model ${ftmodel} \
+    --testset ${testset} \
+    --output ${evalresults} \
+    --reasoning \
+    --tp 1
@@ -12,7 +12,7 @@ train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"
 reward_ds="${project}/data/train/${modelname}-apps-train-reward_dataset.jsonl"

-export CUDA_VISIBLE_DEVICES=0,1,2,3
+# export CUDA_VISIBLE_DEVICES=0,1,2,3

 ## Sampling
 ## APPS
...
set -xe

model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
project="/lustre/S/nanziyuan/projects/ccc"
modelname="qwen25_coder_inst"

trainset="${project}/data/train/${modelname}-apps-train.jsonl"
testset="${project}/data/test/${modelname}-apps-test.jsonl"
train_selected_pairs="${project}/data/train/${modelname}-apps-train-selected_pairs.jsonl"

apps="/lustre/S/nanziyuan/datasets/apps/"

sft="${project}/data/train/${modelname}-sft.jsonl"
ftmodel="${project}/model/qwen25_coder_inst_7b-algolr"

testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-genrm-score.jsonl"

export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7

# python -m codecritic.cli.algolr \
#     --model ${model} \
#     --dataset ${trainset} \
#     --pairinfo ${train_selected_pairs} \
#     --apps ${apps} \
#     --output ${sft} \
#     --level beginner \
#     --tp 1

# deepspeed --module \
# openrlhf.cli.train_sft \
#     --max_len 4096 \
#     --dataset ${sft} \
#     --input_key question \
#     --output_key response \
#     --apply_chat_template \
#     --train_batch_size 256 \
#     --micro_train_batch_size 2 \
#     --max_samples 500000 \
#     --pretrain ${model} \
#     --save_path ${ftmodel} \
#     --save_steps -1 \
#     --logging_steps 1 \
#     --eval_steps -1 \
#     --zero_stage 2 \
#     --max_epochs 1 \
#     --bf16 \
#     --flash_attn \
#     --learning_rate 5e-6 \
#     --load_checkpoint \
#     --gradient_checkpointing \
#     --use_tensorboard "${ftmodel}_log"

python -m codecritic.cli.test_genrm \
    --model ${model} \
    --testset ${testset} \
    --output ${evalresults} \
    --reasoning \
    --tp 1
@@ -6,6 +6,9 @@ project="/lustre/S/nanziyuan/projects/ccc"
 model="/lustre/S/huangdi/open_for_out/models/Qwen2.5-Coder-7B-Instruct/"
 ftmodel="${project}/model/qwen25_coder_inst_7b-orm"

+testset="/lustre/S/nanziyuan/projects/ccc/data/test/qwen25_coder_inst-apps-test.jsonl"
+evalresults="/lustre/S/nanziyuan/projects/ccc/data/eval/qwen25_code_inst-apps-test-orm-score.jsonl"

 deepspeed --module \
 openrlhf.cli.train_rm \
     --save_path ${ftmodel} \
@@ -31,41 +34,41 @@ openrlhf.cli.train_rm \
     --use_tensorboard "${ftmodel}_log"

-# start_server() {
-#     echo "Starting server..."
-#     CUDA_VISIBLE_DEVICES=0 \
-#     python -m openrlhf.cli.serve_rm \
-#         --reward_pretrain ${model} \
-#         --normalize_reward \
-#         --port 5000 \
-#         --bf16 \
-#         --max_len 8192 &
-#     SERVER_PID=$!
-#     echo "Server started with PID: $SERVER_PID"
-# }
+start_server() {
+    echo "Starting server..."
+    CUDA_VISIBLE_DEVICES=0 \
+    python -m openrlhf.cli.serve_rm \
+        --reward_pretrain ${ftmodel} \
+        --normalize_reward \
+        --port 5000 \
+        --bf16 \
+        --max_len 8192 &
+    SERVER_PID=$!
+    echo "Server started with PID: $SERVER_PID"
+}

-# # Function to start the client
-# start_client() {
-#     echo "Starting client..."
-#     python -m codecritic.cli.run_rm_test \
-#         --model ${model} \
-#         --test "${datasets}/sample/min_test.jsonl" \
-#         --apps /lustre/S/nanziyuan/datasets/apps/
-#     CLIENT_EXIT_CODE=$?
-#     echo "Client finished with exit code: $CLIENT_EXIT_CODE"
-# }
+# Function to start the client
+start_client() {
+    echo "Starting client..."
+    python -m codecritic.cli.test_orm \
+        --model ${ftmodel} \
+        --testset ${testset} \
+        --output ${evalresults}
+    CLIENT_EXIT_CODE=$?
+    echo "Client finished with exit code: $CLIENT_EXIT_CODE"
+}

-# # Function to stop the server
-# stop_server() {
-#     echo "Stopping server..."
-#     kill -SIGINT $SERVER_PID
-#     wait $SERVER_PID 2>/dev/null
-#     echo "Server stopped."
-# }
+# Function to stop the server
+stop_server() {
+    echo "Stopping server..."
+    kill -SIGINT $SERVER_PID
+    wait $SERVER_PID 2>/dev/null
+    echo "Server stopped."
+}

-# start_server
-# # Give the server some time to initialize (optional)
-# sleep 60
-# start_client
-# stop_server
-# echo "Execution complete."
+start_server
+# Give the server some time to initialize (optional)
+sleep 60
+start_client
+stop_server
+echo "Execution complete."