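# Additional Experiment:
# There are two primary methods for training a reward model:
# 1. Training with a reward loss on preference pairs
# 2. Training with SFT (Supervised Fine-Tuning) directly
# This experiment aims to compare these two approaches fairly.
# To that end, this script converts a preference dataset into an SFT dataset:
# every (chosen, rejected) pair becomes two SFT samples.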
import argparse
from codecritic.utils.json import load_json
from codecritic.utils.data import mk_critic_qa, mk_critic_verify, mk_sft_item, mk_sft_dataset_info, save_dataset


def convert_preference_to_sft(item):
    """Turn one preference record into a pair of SFT items.

    Args:
        item: A mapping that provides ``item["messages"][0]["content"]``,
            ``item["chosen"]["content"]`` and ``item["rejected"]["content"]``.

    Returns:
        A 2-tuple ``(chosen_item, rejected_item)``. Each element is the result
        of ``mk_sft_item`` applied to ``mk_critic_qa(prompt, answer)``
        concatenated with ``mk_critic_verify(flag)``; the flag is ``True`` for
        the chosen answer and ``False`` for the rejected one.
    """
    prompt = item["messages"][0]["content"]

    # Each answer is paired with the verdict flag handed to mk_critic_verify.
    labelled_answers = (
        (item["chosen"]["content"], True),
        (item["rejected"]["content"], False),
    )

    # Build both conversations first, then wrap them, chosen before rejected.
    conversations = [
        mk_critic_qa(prompt, answer) + mk_critic_verify(verdict)
        for answer, verdict in labelled_answers
    ]
    return tuple(mk_sft_item(conversation) for conversation in conversations)


if __name__ == "__main__":
    # Script entry point: load a preference dataset, expand every record into
    # its two SFT items, and save the result together with its dataset info.
    parser = argparse.ArgumentParser(
        description="Convert a preference dataset into an SFT dataset."
    )
    # Every option is required: without `required=True` a forgotten flag
    # silently becomes None and is passed on to the helpers below, instead of
    # producing a clear usage error from argparse.
    parser.add_argument(
        "--preference_dataset",
        type=str,
        required=True,
        help="Path of the preference dataset file read with load_json.",
    )
    parser.add_argument(
        "--llamafactory",
        type=str,
        required=True,
        help="LLaMA-Factory location passed to save_dataset.",
    )
    parser.add_argument(
        "--dataset_name",
        type=str,
        required=True,
        help="Name of the generated SFT dataset, passed to mk_sft_dataset_info.",
    )
    args = parser.parse_args()

    preference_dataset = load_json(args.preference_dataset)

    # Each preference record contributes two SFT items (chosen, then rejected).
    sft_dataset = []
    for item in preference_dataset:
        sft_dataset.extend(convert_preference_to_sft(item))

    dataset_info = mk_sft_dataset_info(args.dataset_name)
    save_dataset(args.llamafactory, dataset_info, sft_dataset)
