Commit 54409c18 by nzy

step2-4: add outcome reward model trained using SFT loss

parent 0ad3af4a
@@ -66,4 +66,18 @@ score_result_path = ""
 [critic.test.sampling_params]
 n = 1
 temperature = 0.0
-max_new_tokens = 512
\ No newline at end of file
+max_new_tokens = 512
+
+[sftorm]
+model_path = ""
+dataset_name = ""
+dataset_path = ""
+dataset_info_path = ""
+
+[sftorm.train]
+train_yaml_path = ""
+deepspeed_cfg_path = ""
+
+[sftorm.test]
+prompt_path = ""
+score_result_path = ""
\ No newline at end of file
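The new [sftorm] section mirrors the layout of the existing [critic] section: top-level model and dataset paths, a train table consumed by the LLaMA-Factory YAML generator, and a test table for scoring. As a quick orientation (the file name below is a placeholder; the repo's own read_config helper is the real entry point), the keys can be read like this:

# Illustration only: reading the new [sftorm] keys with the standard library.
import tomllib  # Python 3.11+

with open("config.toml", "rb") as f:  # placeholder path
    cfg = tomllib.load(f)

sftorm = cfg["sftorm"]
print(sftorm["model_path"], sftorm["dataset_name"])   # top-level keys
print(sftorm["train"]["train_yaml_path"])             # [sftorm.train]
print(sftorm["test"]["score_result_path"])            # [sftorm.test]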
-from utils import load_jsonl, save_jsonl, save_json, extract_code, read_config
-from utils_preference_dataset import mk_dataset_info, mk_preference_pair
+from utils import load_jsonl, save_json, extract_code, read_config
+from utils_preference_dataset import mk_preference_dataset_info, mk_preference_pair
 from nltk.metrics.distance import edit_distance
 from collections import defaultdict
 from itertools import product, chain
@@ -100,7 +100,7 @@ if __name__ == "__main__":
         all_edit_distance_pairs, 10 * 1000, 5, is_max=True
     )
     max_dataset_cfg = cfg["preference_dataset"]["max_edit_distance"]
-    dataset_info = mk_dataset_info(max_dataset_cfg["dataset_name"])
+    dataset_info = mk_preference_dataset_info(max_dataset_cfg["dataset_name"])
     save_json(metadata, max_dataset_cfg["metadata_path"])
     save_json(
@@ -117,7 +117,7 @@ if __name__ == "__main__":
         all_edit_distance_pairs, 10 * 1000, 5, is_max=False
     )
     min_dataset_cfg = cfg["preference_dataset"]["min_edit_distance"]
-    dataset_info = mk_dataset_info(min_dataset_cfg["dataset_name"])
+    dataset_info = mk_preference_dataset_info(min_dataset_cfg["dataset_name"])
     save_json(metadata, min_dataset_cfg["metadata_path"])
     save_json(
         preference_pairs,
...
# Additional Experiment:
# As we know, there are two primary methods for training a reward model:
# 1. Using reward loss
# 2. Using SFT (Supervised Fine-Tuning) directly
# This experiment aims to fairly compare these two approaches.
from utils import load_json, save_json
from utils_preference_dataset import mk_critic_qa, mk_critic_verify, mk_sft_item


def convert_preference_to_sft(item):
    message = item["messages"][0]["content"]
    chosen = item["chosen"]["content"]
    rejected = item["rejected"]["content"]
    messages1 = mk_critic_qa(message, chosen) + mk_critic_verify(True)
    messages2 = mk_critic_qa(message, rejected) + mk_critic_verify(False)
    return mk_sft_item(messages1), mk_sft_item(messages2)


if __name__ == "__main__":
    preference_path = None
    preference_dataset = load_json(preference_path)
    sft_dataset = []
    for item in preference_dataset:
        sft_dataset.extend(convert_preference_to_sft(item))
\ No newline at end of file
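To make the conversion concrete, the sketch below runs it on one invented preference pair. The three helpers are restated inline only so the snippet is self-contained; they match the definitions added to utils_preference_dataset in this commit, and the example pair itself is made up.

# Self-contained sketch of convert_preference_to_sft on one invented pair.
def mk_critic_qa(instruction, code):
    return [
        {"role": "user", "content": instruction},
        {"role": "assistant", "content": code},
    ]

def mk_critic_verify(answer=None):
    message = [{"role": "user", "content": "Is the code correct (Yes/No)?"}]
    if answer is not None:
        message.append({"role": "assistant", "content": "Yes" if answer else "No"})
    return message

def mk_sft_item(messages):
    return {"messages": messages}

pair = {
    "messages": [{"role": "user", "content": "Write add(a, b) that returns a + b."}],
    "chosen": {"role": "assistant", "content": "```python\ndef add(a, b):\n    return a + b\n```"},
    "rejected": {"role": "assistant", "content": "```python\ndef add(a, b):\n    return a - b\n```"},
}

question = pair["messages"][0]["content"]
chosen_item = mk_sft_item(mk_critic_qa(question, pair["chosen"]["content"]) + mk_critic_verify(True))
rejected_item = mk_sft_item(mk_critic_qa(question, pair["rejected"]["content"]) + mk_critic_verify(False))
# Each item is a four-turn chat: question, code, "Is the code correct (Yes/No)?",
# then "Yes" or "No", so one preference pair yields one positive and one negative SFT example.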
@@ -44,16 +44,17 @@ eval_steps: 500
 def mk_llamafactory_sft_yaml(cfg):
-    with open(cfg["critic"]["train"]["train_yaml_path"], "w") as f:
+    model_type = cfg["model_type"]
+    with open(cfg[model_type]["train"]["train_yaml_path"], "w") as f:
         train_str = train_yaml.format(
             model_path=cfg["model"],
-            deepspeed_config_path=cfg["critic"]["train"]["deepspeed_cfg_path"],
-            dataset_name=cfg["critic"]["train"]["dataset_name"],
-            critic_model_path=cfg["critic"]["model_path"],
+            deepspeed_config_path=cfg[model_type]["train"]["deepspeed_cfg_path"],
+            dataset_name=cfg[model_type]["train"]["dataset_name"],
+            critic_model_path=cfg[model_type]["model_path"],
         )
         f.write(train_str)


 if __name__ == "__main__":
-    cfg = read_config()
+    cfg = read_config(["model_type"])
     mk_llamafactory_sft_yaml(cfg)
\ No newline at end of file
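The train_yaml template string itself is outside this hunk; only its four placeholders (model_path, deepspeed_config_path, dataset_name, critic_model_path) and the eval_steps: 500 context line are visible. Purely as orientation, a LLaMA-Factory-style SFT config with those placeholders might look like the sketch below; every key and value here other than eval_steps is an assumption, not the repo's actual template.

# Hypothetical stand-in for train_yaml (the real template is not part of this diff);
# it only shows where the four .format() placeholders would land.
train_yaml_example = """\
model_name_or_path: {model_path}
stage: sft
do_train: true
finetuning_type: full
deepspeed: {deepspeed_config_path}
dataset: {dataset_name}
template: qwen  # placeholder; depends on the base model
cutoff_len: 2048
output_dir: {critic_model_path}
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-5
num_train_epochs: 1.0
bf16: true
val_size: 0.01
eval_strategy: steps
eval_steps: 500
"""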
@@ -17,7 +17,7 @@ score_token = score_tokens[0]
 vllm_score(
     cfg["critic"]["model_path"],
-    cfg["critic"]["test"]["reson_result_path"],
+    cfg["critic"]["test"]["reason_result_path"],
     cfg["critic"]["test"]["score_result_path"],
     score_token
 )
\ No newline at end of file
@@ -8,6 +8,7 @@ from copy import deepcopy
 from utils import load_jsonl, save_jsonl, extract_code, read_config
 from utils_metric import group_results, score_pass_at_k
+from utils_preference_dataset import code_template
 from transformers import AutoTokenizer
@@ -70,7 +71,7 @@ def preprocess_dataset(model_path, test_dataset, gpu_num):
     result = []
     for i, item in enumerate(test_dataset):
         messages = deepcopy(item["messages"])
-        messages[-1]["content"] = extract_code(messages[-1]["content"])
+        messages[-1]["content"] = code_template.format(extract_code(messages[-1]["content"]))
         # https://github.com/hiyouga/LLaMA-Factory/blob/a45f3f5461e2936b9e119eda2ef4d8c7a4131740/tests/data/test_template.py#L58
         # # llama factory's template should match tokenizer's `apply_chat_template`.
         item["format_str"] = [tokenizer.apply_chat_template(messages, tokenize=False)]
...
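The one-line change in the hunk above re-wraps the extracted code in the markdown fence defined by code_template, presumably so the critic is scored on answers formatted the same way as its training data (whose chosen/rejected contents are also built with code_template). A tiny illustration, with a made-up completion string:

# Before vs. after the change, for one made-up assistant message.
from utils import extract_code
from utils_preference_dataset import code_template

raw_answer = "Sure!\n```python\ndef f(x):\n    return x + 1\n```"
code = extract_code(raw_answer)        # pulls the code out of the markdown reply
wrapped = code_template.format(code)   # puts it back inside a ```python fence
# old behaviour: messages[-1]["content"] = code
# new behaviour: messages[-1]["content"] = wrapped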
from utils_vllm import vllm_score
from utils import read_config, load_jsonl, save_jsonl, extract_code
from utils_preference_dataset import code_template, mk_critic_qa, mk_critic_verify, mk_sft_item
from transformers import AutoTokenizer


def preprocess_test_item(item):
    question = item["messages"][0]["content"]
    answer = item["messages"][1]["content"]
    code = code_template.format(extract_code(answer))
    item["messages"] = mk_critic_qa(question, code) + mk_critic_verify()
    return item


if __name__ == "__main__":
    cfg = read_config()
    raw_test_dataset = load_jsonl(cfg["dataset"]["minimal_test_path"])
    test_dataset = [preprocess_test_item(item) for item in raw_test_dataset]
    save_jsonl(test_dataset, cfg["sftorm"]["test"]["prompt_path"])

    tokenizer = AutoTokenizer.from_pretrained(cfg["sftorm"]["model_path"])
    score_tokens = tokenizer.encode("Yes")
    assert len(score_tokens) == 1
    score_token = score_tokens[0]

    vllm_score(
        cfg["sftorm"]["model_path"],
        cfg["sftorm"]["test"]["prompt_path"],
        cfg["sftorm"]["test"]["score_result_path"],
        score_token
    )
\ No newline at end of file
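vllm_score itself lives in utils_vllm and is not part of this diff. Passing score_token (the id of "Yes") suggests it scores each prompt by the probability of "Yes" as the first generated token; the sketch below is one plausible way to do that with vLLM, assuming a recent version where per-token logprobs are returned as dicts of token id to Logprob. It illustrates the idea, not the repo's implementation.

# Sketch only: score chat prompts by the probability that the first generated
# token is "Yes". The function name and structure are assumptions.
import math

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams


def score_yes_probability(model_path, chats, yes_token_id):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    prompts = [
        tokenizer.apply_chat_template(m, tokenize=False, add_generation_prompt=True)
        for m in chats  # each element is a "messages" list like the ones saved above
    ]
    llm = LLM(model=model_path)
    params = SamplingParams(max_tokens=1, temperature=0.0, logprobs=20)
    scores = []
    for out in llm.generate(prompts, params):
        first_token_logprobs = out.outputs[0].logprobs[0]  # token id -> Logprob
        entry = first_token_logprobs.get(yes_token_id)
        # If "Yes" is not among the top-20 candidates, treat its probability as ~0.
        scores.append(math.exp(entry.logprob) if entry is not None else 0.0)
    return scores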
-def mk_dataset_info(dataset_name):
+def mk_preference_dataset_info(dataset_name):
     return {
         dataset_name: {
             "file_name": f"{dataset_name}.json",
@@ -19,14 +19,59 @@ def mk_dataset_info(dataset_name):
         }
     }

 # see utils.extract_code
 code_template = r"```python{}```"

 def mk_preference_pair(instruction, chosen_code, rejected_code):
     return {
         "messages": [
             {"role": "user", "content": instruction},
         ],
         "chosen": {"role": "assistant", "content": code_template.format(chosen_code)},
-        "rejected": {"role": "assistant", "content": code_template.format(rejected_code)},
+        "rejected": {
+            "role": "assistant",
+            "content": code_template.format(rejected_code),
+        },
     }
+
+
+def mk_sft_dataset_info(dataset_name):
+    return {
+        dataset_name: {
+            "file_name": f"{dataset_name}.json",
+            "formatting": "sharegpt",
+            "columns": {"messages": "messages"},
+            "tags": {
+                "role_tag": "role",
+                "content_tag": "content",
+                "user_tag": "user",
+                "assistant_tag": "assistant",
+                "system_tag": "system",
+            },
+        }
+    }
+
+
+# Note that the human and observation should appear in odd positions
+# while llm should appear in even positions.
+def mk_sft_item(messages):
+    return {"messages": messages}
+
+
+def mk_critic_qa(instruction, code):
+    # Code should be enclosed in a markdown code block
+    return [
+        {"role": "user", "content": instruction},
+        {"role": "assistant", "content": code},
+    ]
+
+
+def mk_critic_verify(answer: bool | None = None):
+    message = [{"role": "user", "content": "Is the code correct (Yes/No)?"}]
+    if answer is not None:
+        response = "Yes" if answer else "No"
+        message.append({"role": "assistant", "content": response})
+    return message
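For context, mk_sft_dataset_info emits the kind of entry LLaMA-Factory's dataset_info.json uses for sharegpt-formatted data, and the role-alternation note above is exactly what mk_critic_qa followed by mk_critic_verify produces (user, assistant, user, and optionally assistant). A small usage example, with a placeholder dataset name:

# Runnable inside the repo; "critic_sft" is a placeholder dataset name.
from utils_preference_dataset import mk_sft_dataset_info

info = mk_sft_dataset_info("critic_sft")
# {
#     "critic_sft": {
#         "file_name": "critic_sft.json",
#         "formatting": "sharegpt",
#         "columns": {"messages": "messages"},
#         "tags": {"role_tag": "role", "content_tag": "content", "user_tag": "user",
#                  "assistant_tag": "assistant", "system_tag": "system"},
#     }
# }
# The items written into critic_sft.json are the mk_sft_item(...) dicts, whose
# user/assistant turns alternate as required by the sharegpt format.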