Commit 54409c18 by nzy

step2-4: add outcome reward model trained using SFT loss

parent 0ad3af4a
......@@ -66,4 +66,18 @@ score_result_path = ""
[critic.test.sampling_params]
n = 1
temperature = 0.0
max_new_tokens = 512
\ No newline at end of file
max_new_tokens = 512
[sftorm]
model_path = ""
dataset_name = ""
dataset_path = ""
dataset_info_path = ""
[sftorm.train]
train_yaml_path = ""
deepspeed_cfg_path = ""
[sftorm.test]
prompt_path = ""
score_result_path = ""
\ No newline at end of file
from utils import load_jsonl, save_jsonl, save_json, extract_code, read_config
from utils_preference_dataset import mk_dataset_info, mk_preference_pair
from utils import load_jsonl, save_json, extract_code, read_config
from utils_preference_dataset import mk_preference_dataset_info, mk_preference_pair
from nltk.metrics.distance import edit_distance
from collections import defaultdict
from itertools import product, chain
......@@ -100,7 +100,7 @@ if __name__ == "__main__":
all_edit_distance_pairs, 10 * 1000, 5, is_max=True
)
max_dataset_cfg = cfg["preference_dataset"]["max_edit_distance"]
dataset_info = mk_dataset_info(max_dataset_cfg["dataset_name"])
dataset_info = mk_preference_dataset_info(max_dataset_cfg["dataset_name"])
save_json(metadata, max_dataset_cfg["metadata_path"])
save_json(
......@@ -117,7 +117,7 @@ if __name__ == "__main__":
all_edit_distance_pairs, 10 * 1000, 5, is_max=False
)
min_dataset_cfg = cfg["preference_dataset"]["min_edit_distance"]
dataset_info = mk_dataset_info(min_dataset_cfg["dataset_name"])
dataset_info = mk_preference_dataset_info(min_dataset_cfg["dataset_name"])
save_json(metadata, min_dataset_cfg["metadata_path"])
save_json(
preference_pairs,
......
# Additional Experiment:
# As we know, there are two primary methods for training a reward model:
# 1. Using reward loss
# 2. Using SFT (Supervised Fine-Tuning) directly
# This experiment aims to fairly compare these two approaches.
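# For reference, "reward loss" here usually means the pairwise Bradley-Terry objective
#     L = -log(sigmoid(r(x, y_chosen) - r(x, y_rejected)))
# computed from a scalar reward head, while the SFT variant below trains a plain language model
# with cross-entropy on a single "Yes"/"No" verification token (see mk_critic_verify) and reads
# the probability of "Yes" as the score at test time.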
from utils import load_json, save_json
from utils_preference_dataset import mk_critic_qa, mk_critic_verify, mk_sft_item
def convert_preference_to_sft(item):
message = item["messages"][0]["content"]
chosen = item["chosen"]["content"]
rejected = item["rejected"]["content"]
messages1 = mk_critic_qa(message, chosen) + mk_critic_verify(True)
messages2 = mk_critic_qa(message, rejected) + mk_critic_verify(False)
return mk_sft_item(messages1), mk_sft_item(messages2)
if __name__ == "__main__":
preference_path = None
preference_dataset = load_json(preference_path)
sft_dataset = []
for item in preference_dataset:
sft_dataset.extend(convert_preference_to_sft(item))
\ No newline at end of file
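The __main__ block above builds sft_dataset but never writes it to disk. A minimal completion sketch, assuming the outputs go to the paths in the new [sftorm] section and the preference pairs are read from a path in the preference_dataset section (that source key is an assumption, not part of this commit):

# Sketch only; the preference-pair source key below is assumed, not taken from this commit.
from utils import read_config, load_json, save_json
from utils_preference_dataset import mk_sft_dataset_info

cfg = read_config()
sftorm_cfg = cfg["sftorm"]

# Preference pairs produced by the earlier step (path key assumed).
preference_dataset = load_json(cfg["preference_dataset"]["max_edit_distance"]["dataset_path"])

sft_dataset = []
for item in preference_dataset:
    # Each preference pair yields one "Yes" (chosen) and one "No" (rejected) SFT item.
    sft_dataset.extend(convert_preference_to_sft(item))

# Write the SFT data plus its LLaMA-Factory dataset_info entry.
save_json(sft_dataset, sftorm_cfg["dataset_path"])
save_json(mk_sft_dataset_info(sftorm_cfg["dataset_name"]), sftorm_cfg["dataset_info_path"])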
......@@ -44,16 +44,17 @@ eval_steps: 500
def mk_llamafactory_sft_yaml(cfg):
with open(cfg["critic"]["train"]["train_yaml_path"], "w") as f:
model_type = cfg["model_type"]
with open(cfg[model_type]["train"]["train_yaml_path"], "w") as f:
train_str = train_yaml.format(
model_path=cfg["model"],
deepspeed_config_path=cfg["critic"]["train"]["deepspeed_cfg_path"],
dataset_name=cfg["critic"]["train"]["dataset_name"],
critic_model_path=cfg["critic"]["model_path"],
deepspeed_config_path=cfg[model_type]["train"]["deepspeed_cfg_path"],
dataset_name=cfg[model_type]["train"]["dataset_name"],
critic_model_path=cfg[model_type]["model_path"],
)
f.write(train_str)
if __name__ == "__main__":
cfg = read_config()
cfg = read_config(["model_type"])
mk_llamafactory_sft_yaml(cfg)
\ No newline at end of file
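The switch from read_config() to read_config(["model_type"]) suggests the config loader now accepts extra required keys, presumably passed on the command line, so the same script can emit a training YAML for either the critic or the sftorm model. utils.read_config is not shown in this diff; the following is only a guess at its shape:

# Purely illustrative guess at utils.read_config; the real helper is not part of this diff.
import argparse
import tomllib  # Python 3.11+; older interpreters can use the third-party `toml` package

def read_config(extra_keys=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("config", help="path to the TOML config file")
    for key in extra_keys or []:
        parser.add_argument(f"--{key}", required=True)  # e.g. --model_type sftorm
    args = parser.parse_args()
    with open(args.config, "rb") as f:
        cfg = tomllib.load(f)
    for key in extra_keys or []:
        cfg[key] = getattr(args, key)  # e.g. cfg["model_type"] == "critic" or "sftorm"
    return cfg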
......@@ -17,7 +17,7 @@ score_token = score_tokens[0]
vllm_score(
cfg["critic"]["model_path"],
cfg["critic"]["test"]["reson_result_path"],
cfg["critic"]["test"]["reason_result_path"],
cfg["critic"]["test"]["score_result_path"],
score_token
)
\ No newline at end of file
......@@ -8,6 +8,7 @@ from copy import deepcopy
from utils import load_jsonl, save_jsonl, extract_code, read_config
from utils_metric import group_results, score_pass_at_k
from utils_preference_dataset import code_template
from transformers import AutoTokenizer
......@@ -70,7 +71,7 @@ def preprocess_dataset(model_path, test_dataset, gpu_num):
result = []
for i, item in enumerate(test_dataset):
messages = deepcopy(item["messages"])
messages[-1]["content"] = extract_code(messages[-1]["content"])
messages[-1]["content"] = code_template.format(extract_code(messages[-1]["content"]))
# https://github.com/hiyouga/LLaMA-Factory/blob/a45f3f5461e2936b9e119eda2ef4d8c7a4131740/tests/data/test_template.py#L58
# LLaMA-Factory's template should match the tokenizer's `apply_chat_template`.
item["format_str"] = [tokenizer.apply_chat_template(messages, tokenize=False)]
......
from utils_vllm import vllm_score
from utils import read_config, load_jsonl, save_jsonl, extract_code
from utils_preference_dataset import code_template, mk_critic_qa, mk_critic_verify, mk_sft_item
from transformers import AutoTokenizer
def preprocess_test_item(item):
question = item["messages"][0]["content"]
answer = item["messages"][1]["content"]
code = code_template.format(extract_code(answer))
item["messages"] = mk_critic_qa(question, code) + mk_critic_verify()
return item
if __name__ == "__main__":
cfg = read_config()
raw_test_dataset = load_jsonl(cfg["dataset"]["minimal_test_path"])
test_dataset = [preprocess_test_item(item) for item in raw_test_dataset]
save_jsonl(test_dataset, cfg["sftorm"]["test"]["prompt_path"])
tokenizer = AutoTokenizer.from_pretrained(cfg["sftorm"]["model_path"])
score_tokens = tokenizer.encode("Yes")
assert len(score_tokens) == 1
score_token = score_tokens[0]
vllm_score(
cfg["sftorm"]["model_path"],
cfg["sftorm"]["test"]["prompt_path"],
cfg["sftorm"]["test"]["score_result_path"],
score_token
)
\ No newline at end of file
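Both the critic and the sftorm tests end with utils_vllm.vllm_score and a single "Yes" token id. That helper is not included in this commit; a plausible sketch of the idea, scoring each prompt by the probability that the model emits "Yes" as its first generated token, might look like this with the public vLLM API:

# Illustrative sketch of what a vllm_score-style helper might do; not the repo's actual implementation.
import math
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
from utils import load_jsonl, save_jsonl

def score_with_yes_token(model_path, prompt_path, result_path, score_token):
    # Assumes each item carries an open-ended `messages` conversation, as in the sftorm test above.
    items = load_jsonl(prompt_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    prompts = [
        tokenizer.apply_chat_template(item["messages"], tokenize=False, add_generation_prompt=True)
        for item in items
    ]
    llm = LLM(model=model_path)
    params = SamplingParams(max_tokens=1, temperature=0.0, logprobs=20)
    outputs = llm.generate(prompts, params)
    for item, out in zip(items, outputs):
        first_step = out.outputs[0].logprobs[0]  # token id -> Logprob for the first generated token
        lp = first_step[score_token].logprob if score_token in first_step else float("-inf")
        item["score"] = math.exp(lp)  # probability mass on "Yes"
    save_jsonl(items, result_path)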
def mk_dataset_info(dataset_name):
def mk_preference_dataset_info(dataset_name):
return {
dataset_name: {
"file_name": f"{dataset_name}.json",
......@@ -19,14 +19,59 @@ def mk_dataset_info(dataset_name):
}
}
# see utils.extract_code
code_template = r"```python{}```"
def mk_preference_pair(instruction, chosen_code, rejected_code):
return {
"messages": [
{"role": "user", "content": instruction},
],
"chosen": {"role": "assistant", "content": code_template.format(chosen_code)},
"rejected": {"role": "assistant", "content": code_template.format(rejected_code)},
"rejected": {
"role": "assistant",
"content": code_template.format(rejected_code),
},
}
def mk_sft_dataset_info(dataset_name):
return {
dataset_name: {
"file_name": f"{dataset_name}.json",
"formatting": "sharegpt",
"columns": {"messages": "messages"},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant",
"system_tag": "system",
},
}
}
# Note that human and observation messages should appear at odd positions,
# while llm (assistant) messages should appear at even positions.
def mk_sft_item(messages):
return {"messages": messages}
def mk_critic_qa(instruction, code):
# Code should be enclosed in a markdown code block
return [
{"role": "user", "content": instruction},
{"role": "assistant", "content": code},
]
def mk_critic_verify(answer: bool | None = None):
message = [{"role": "user", "content": "Is the code correct (Yes/No)?"}]
if answer is not None:
response = "Yes" if answer else "No"
message.append({"role": "assistant", "content": response})
return message
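# Putting the helpers above together: one SFT training example for the outcome reward model is the
# original question, the fenced candidate code, the fixed verification question, and a one-token
# "Yes"/"No" answer. The strings below are invented purely for illustration.
example_messages = mk_critic_qa(
    "Write a function add(a, b) that returns the sum of a and b.",
    code_template.format("\ndef add(a, b):\n    return a + b\n"),
)
example_messages += mk_critic_verify(True)  # appends "Is the code correct (Yes/No)?" and the answer "Yes"
example_sample = mk_sft_item(example_messages)
# At test time mk_critic_verify() is called with no answer, and the probability that the model
# produces "Yes" as the next token is used as the outcome reward (cf. vllm_score above).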