Commit 54409c18 by nzy

step2-4: add outcome reward model trained using SFT loss

parent 0ad3af4a
@@ -66,4 +66,18 @@ score_result_path = ""
 [critic.test.sampling_params]
 n = 1
 temperature = 0.0
-max_new_tokens = 512
\ No newline at end of file
+max_new_tokens = 512
+
+[sftorm]
+model_path = ""
+dataset_name = ""
+dataset_path = ""
+dataset_info_path = ""
+
+[sftorm.train]
+train_yaml_path = ""
+deepspeed_cfg_path = ""
+
+[sftorm.test]
+prompt_path = ""
+score_result_path = ""
\ No newline at end of file
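The new [sftorm] section mirrors the layout of the existing [critic] section: top-level model and dataset paths, a train table consumed by the LLaMA-Factory YAML generator, and a test table for scoring. As a quick orientation (the file name below is a placeholder; the repo's own read_config helper is the real entry point), the keys can be read like this:

# Illustration only: reading the new [sftorm] keys with the standard library.
import tomllib  # Python 3.11+

with open("config.toml", "rb") as f:  # placeholder path
    cfg = tomllib.load(f)

sftorm = cfg["sftorm"]
print(sftorm["model_path"], sftorm["dataset_name"])   # top-level keys
print(sftorm["train"]["train_yaml_path"])             # [sftorm.train]
print(sftorm["test"]["score_result_path"])            # [sftorm.test]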
-from utils import load_jsonl, save_jsonl, save_json, extract_code, read_config
-from utils_preference_dataset import mk_dataset_info, mk_preference_pair
+from utils import load_jsonl, save_json, extract_code, read_config
+from utils_preference_dataset import mk_preference_dataset_info, mk_preference_pair
 from nltk.metrics.distance import edit_distance
 from collections import defaultdict
 from itertools import product, chain
@@ -100,7 +100,7 @@ if __name__ == "__main__":
         all_edit_distance_pairs, 10 * 1000, 5, is_max=True
     )
     max_dataset_cfg = cfg["preference_dataset"]["max_edit_distance"]
-    dataset_info = mk_dataset_info(max_dataset_cfg["dataset_name"])
+    dataset_info = mk_preference_dataset_info(max_dataset_cfg["dataset_name"])
     save_json(metadata, max_dataset_cfg["metadata_path"])
     save_json(
@@ -117,7 +117,7 @@ if __name__ == "__main__":
         all_edit_distance_pairs, 10 * 1000, 5, is_max=False
     )
     min_dataset_cfg = cfg["preference_dataset"]["min_edit_distance"]
-    dataset_info = mk_dataset_info(min_dataset_cfg["dataset_name"])
+    dataset_info = mk_preference_dataset_info(min_dataset_cfg["dataset_name"])
     save_json(metadata, min_dataset_cfg["metadata_path"])
     save_json(
         preference_pairs,
...
# Additional Experiment:
# As we know, there are two primary methods for training a reward model:
# 1. Using reward loss
# 2. Using SFT (Supervised Fine-Tuning) directly
# This experiment aims to fairly compare these two approaches.
from utils import load_json, save_json
from utils_preference_dataset import mk_critic_qa, mk_critic_verify, mk_sft_item


def convert_preference_to_sft(item):
    message = item["messages"][0]["content"]
    chosen = item["chosen"]["content"]
    rejected = item["rejected"]["content"]
    messages1 = mk_critic_qa(message, chosen) + mk_critic_verify(True)
    messages2 = mk_critic_qa(message, rejected) + mk_critic_verify(False)
    return mk_sft_item(messages1), mk_sft_item(messages2)


if __name__ == "__main__":
    preference_path = None
    preference_dataset = load_json(preference_path)
    sft_dataset = []
    for item in preference_dataset:
        sft_dataset.extend(convert_preference_to_sft(item))
\ No newline at end of file
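To make the conversion concrete, the sketch below runs it on one invented preference pair. The three helpers are restated inline only so the snippet is self-contained; they match the definitions added to utils_preference_dataset in this commit, and the example pair itself is made up.

# Self-contained sketch of convert_preference_to_sft on one invented pair.
def mk_critic_qa(instruction, code):
    return [
        {"role": "user", "content": instruction},
        {"role": "assistant", "content": code},
    ]

def mk_critic_verify(answer=None):
    message = [{"role": "user", "content": "Is the code correct (Yes/No)?"}]
    if answer is not None:
        message.append({"role": "assistant", "content": "Yes" if answer else "No"})
    return message

def mk_sft_item(messages):
    return {"messages": messages}

pair = {
    "messages": [{"role": "user", "content": "Write add(a, b) that returns a + b."}],
    "chosen": {"role": "assistant", "content": "```python\ndef add(a, b):\n    return a + b\n```"},
    "rejected": {"role": "assistant", "content": "```python\ndef add(a, b):\n    return a - b\n```"},
}

question = pair["messages"][0]["content"]
chosen_item = mk_sft_item(mk_critic_qa(question, pair["chosen"]["content"]) + mk_critic_verify(True))
rejected_item = mk_sft_item(mk_critic_qa(question, pair["rejected"]["content"]) + mk_critic_verify(False))
# Each item is a four-turn chat: question, code, "Is the code correct (Yes/No)?",
# then "Yes" or "No", so one preference pair yields one positive and one negative SFT example.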
@@ -44,16 +44,17 @@ eval_steps: 500
 def mk_llamafactory_sft_yaml(cfg):
-    with open(cfg["critic"]["train"]["train_yaml_path"], "w") as f:
+    model_type = cfg["model_type"]
+    with open(cfg[model_type]["train"]["train_yaml_path"], "w") as f:
         train_str = train_yaml.format(
             model_path=cfg["model"],
-            deepspeed_config_path=cfg["critic"]["train"]["deepspeed_cfg_path"],
-            dataset_name=cfg["critic"]["train"]["dataset_name"],
-            critic_model_path=cfg["critic"]["model_path"],
+            deepspeed_config_path=cfg[model_type]["train"]["deepspeed_cfg_path"],
+            dataset_name=cfg[model_type]["train"]["dataset_name"],
+            critic_model_path=cfg[model_type]["model_path"],
         )
         f.write(train_str)


 if __name__ == "__main__":
-    cfg = read_config()
+    cfg = read_config(["model_type"])
     mk_llamafactory_sft_yaml(cfg)
\ No newline at end of file
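The train_yaml template string itself is outside this hunk; only its four placeholders (model_path, deepspeed_config_path, dataset_name, critic_model_path) and the eval_steps: 500 context line are visible. Purely as orientation, a LLaMA-Factory-style SFT config with those placeholders might look like the sketch below; every key and value here other than eval_steps is an assumption, not the repo's actual template.

# Hypothetical stand-in for train_yaml (the real template is not part of this diff);
# it only shows where the four .format() placeholders would land.
train_yaml_example = """\
model_name_or_path: {model_path}
stage: sft
do_train: true
finetuning_type: full
deepspeed: {deepspeed_config_path}
dataset: {dataset_name}
template: qwen  # placeholder; depends on the base model
cutoff_len: 2048
output_dir: {critic_model_path}
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-5
num_train_epochs: 1.0
bf16: true
val_size: 0.01
eval_strategy: steps
eval_steps: 500
"""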
@@ -17,7 +17,7 @@ score_token = score_tokens[0]
 vllm_score(
     cfg["critic"]["model_path"],
-    cfg["critic"]["test"]["reson_result_path"],
+    cfg["critic"]["test"]["reason_result_path"],
     cfg["critic"]["test"]["score_result_path"],
     score_token
 )
\ No newline at end of file
@@ -8,6 +8,7 @@ from copy import deepcopy
 from utils import load_jsonl, save_jsonl, extract_code, read_config
 from utils_metric import group_results, score_pass_at_k
+from utils_preference_dataset import code_template
 from transformers import AutoTokenizer
@@ -70,7 +71,7 @@ def preprocess_dataset(model_path, test_dataset, gpu_num):
     result = []
     for i, item in enumerate(test_dataset):
         messages = deepcopy(item["messages"])
-        messages[-1]["content"] = extract_code(messages[-1]["content"])
+        messages[-1]["content"] = code_template.format(extract_code(messages[-1]["content"]))
         # https://github.com/hiyouga/LLaMA-Factory/blob/a45f3f5461e2936b9e119eda2ef4d8c7a4131740/tests/data/test_template.py#L58
         # # llama factory's template should match tokenizer's `apply_chat_template`.
         item["format_str"] = [tokenizer.apply_chat_template(messages, tokenize=False)]
...
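The one-line change in the hunk above re-wraps the extracted code in the markdown fence defined by code_template, presumably so the critic is scored on answers formatted the same way as its training data (whose chosen/rejected contents are also built with code_template). A tiny illustration, with a made-up completion string:

# Before vs. after the change, for one made-up assistant message.
from utils import extract_code
from utils_preference_dataset import code_template

raw_answer = "Sure!\n```python\ndef f(x):\n    return x + 1\n```"
code = extract_code(raw_answer)        # pulls the code out of the markdown reply
wrapped = code_template.format(code)   # puts it back inside a ```python fence
# old behaviour: messages[-1]["content"] = code
# new behaviour: messages[-1]["content"] = wrapped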
from utils_vllm import vllm_score
from utils import read_config, load_jsonl, save_jsonl, extract_code
from utils_preference_dataset import code_template, mk_critic_qa, mk_critic_verify, mk_sft_item
from transformers import AutoTokenizer


def preprocess_test_item(item):
    question = item["messages"][0]["content"]
    answer = item["messages"][1]["content"]
    code = code_template.format(extract_code(answer))
    item["messages"] = mk_critic_qa(question, code) + mk_critic_verify()
    return item


if __name__ == "__main__":
    cfg = read_config()
    raw_test_dataset = load_jsonl(cfg["dataset"]["minimal_test_path"])
    test_dataset = [preprocess_test_item(item) for item in raw_test_dataset]
    save_jsonl(test_dataset, cfg["sftorm"]["test"]["prompt_path"])

    tokenizer = AutoTokenizer.from_pretrained(cfg["sftorm"]["model_path"])
    score_tokens = tokenizer.encode("Yes")
    assert len(score_tokens) == 1
    score_token = score_tokens[0]

    vllm_score(
        cfg["sftorm"]["model_path"],
        cfg["sftorm"]["test"]["prompt_path"],
        cfg["sftorm"]["test"]["score_result_path"],
        score_token
    )
\ No newline at end of file
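vllm_score itself lives in utils_vllm and is not part of this diff. Passing score_token (the id of "Yes") suggests it scores each prompt by the probability of "Yes" as the first generated token; the sketch below is one plausible way to do that with vLLM, assuming a recent version where per-token logprobs are returned as dicts of token id to Logprob. It illustrates the idea, not the repo's implementation.

# Sketch only: score chat prompts by the probability that the first generated
# token is "Yes". The function name and structure are assumptions.
import math

from transformers import AutoTokenizer
from vllm import LLM, SamplingParams


def score_yes_probability(model_path, chats, yes_token_id):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    prompts = [
        tokenizer.apply_chat_template(m, tokenize=False, add_generation_prompt=True)
        for m in chats  # each element is a "messages" list like the ones saved above
    ]
    llm = LLM(model=model_path)
    params = SamplingParams(max_tokens=1, temperature=0.0, logprobs=20)
    scores = []
    for out in llm.generate(prompts, params):
        first_token_logprobs = out.outputs[0].logprobs[0]  # token id -> Logprob
        entry = first_token_logprobs.get(yes_token_id)
        # If "Yes" is not among the top-20 candidates, treat its probability as ~0.
        scores.append(math.exp(entry.logprob) if entry is not None else 0.0)
    return scores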
-def mk_dataset_info(dataset_name):
+def mk_preference_dataset_info(dataset_name):
     return {
         dataset_name: {
             "file_name": f"{dataset_name}.json",
@@ -19,14 +19,59 @@ def mk_dataset_info(dataset_name):
         }
     }

 # see utils.extract_code
 code_template = r"```python{}```"

 def mk_preference_pair(instruction, chosen_code, rejected_code):
     return {
         "messages": [
             {"role": "user", "content": instruction},
         ],
         "chosen": {"role": "assistant", "content": code_template.format(chosen_code)},
-        "rejected": {"role": "assistant", "content": code_template.format(rejected_code)},
+        "rejected": {
+            "role": "assistant",
+            "content": code_template.format(rejected_code),
+        },
     }
+
+
+def mk_sft_dataset_info(dataset_name):
+    return {
+        dataset_name: {
+            "file_name": f"{dataset_name}.json",
+            "formatting": "sharegpt",
+            "columns": {"messages": "messages"},
+            "tags": {
+                "role_tag": "role",
+                "content_tag": "content",
+                "user_tag": "user",
+                "assistant_tag": "assistant",
+                "system_tag": "system",
+            },
+        }
+    }
+
+
+# Note that the human and observation should appear in odd positions
+# while llm should appear in even positions.
+def mk_sft_item(messages):
+    return {"messages": messages}
+
+
+def mk_critic_qa(instruction, code):
+    # Code should be enclosed in a markdown code block
+    return [
+        {"role": "user", "content": instruction},
+        {"role": "assistant", "content": code},
+    ]
+
+
+def mk_critic_verify(answer: bool | None = None):
+    message = [{"role": "user", "content": "Is the code correct (Yes/No)?"}]
+    if answer is not None:
+        response = "Yes" if answer else "No"
+        message.append({"role": "assistant", "content": response})
+    return message
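For context, mk_sft_dataset_info emits the kind of entry LLaMA-Factory's dataset_info.json uses for sharegpt-formatted data, and the role-alternation note above is exactly what mk_critic_qa followed by mk_critic_verify produces (user, assistant, user, and optionally assistant). A small usage example, with a placeholder dataset name:

# Runnable inside the repo; "critic_sft" is a placeholder dataset name.
from utils_preference_dataset import mk_sft_dataset_info

info = mk_sft_dataset_info("critic_sft")
# {
#     "critic_sft": {
#         "file_name": "critic_sft.json",
#         "formatting": "sharegpt",
#         "columns": {"messages": "messages"},
#         "tags": {"role_tag": "role", "content_tag": "content", "user_tag": "user",
#                  "assistant_tag": "assistant", "system_tag": "system"},
#     }
# }
# The items written into critic_sft.json are the mk_sft_item(...) dicts, whose
# user/assistant turns alternate as required by the sharegpt format.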