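# Additional Experiment:
# As we know, there are two primary methods for training a reward model:
# 1. Using reward loss
# 2. Using SFT (Supervised Fine-Tuning) directly
# This experiment aims to fairly compare these two approaches.
# This script supports approach 2: it converts a preference dataset
# (messages / chosen / rejected) into SFT question/response samples.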
import argparse
from codecritic.data.utils import mk_message, save_jsonl_dataset
from codecritic.utils.json import load_jsonl
from codecritic.data.verify import mk_critic_verify


def mk_sft_dataset(messages):
    """Split a message sequence into its prompt part and its final turn.

    Args:
        messages: A sliceable sequence of messages.

    Returns:
        A dict with two entries: ``"question"`` holds every element except
        the last one, and ``"response"`` holds the last element as a
        one-element slice (an empty slice when ``messages`` is empty).
    """
    prompt, final_turn = messages[:-1], messages[-1:]
    return {"question": prompt, "response": final_turn}


def convert_preference_to_sft(item):
    """Turn one preference record into a pair of SFT samples.

    Args:
        item: A mapping with the keys ``"messages"``, ``"chosen"`` and
            ``"rejected"``. The three values are concatenated with ``+``,
            so they are presumably lists of messages -- TODO confirm
            against the dataset schema.

    Returns:
        A 2-tuple ``(chosen_sample, rejected_sample)``, each produced by
        ``mk_sft_dataset``.
    """
    prompt, preferred, dispreferred = (
        item["messages"],
        item["chosen"],
        item["rejected"],
    )

    # The chosen branch is extended with mk_critic_verify(True) and the
    # rejected branch with mk_critic_verify(False); presumably these are the
    # "correct"/"incorrect" verification turns -- verify in
    # codecritic.data.verify.
    positive_dialog = prompt + preferred + mk_critic_verify(True)
    negative_dialog = prompt + dispreferred + mk_critic_verify(False)

    return mk_sft_dataset(positive_dialog), mk_sft_dataset(negative_dialog)


if __name__ == "__main__":
    # Command-line entry point: read a preference dataset, expand every
    # record into its two SFT samples, and write the result out.
    parser = argparse.ArgumentParser()
    # Required: without it load_jsonl would be called with None, so fail
    # early with a usage error instead.
    parser.add_argument(
        "--preference_dataset",
        type=str,
        required=True,
        help="Path to the preference dataset file read with load_jsonl.",
    )
    # NOTE(review): left optional because save_jsonl_dataset's handling of a
    # None output_dir is not visible from this file -- confirm and consider
    # making it required as well.
    parser.add_argument(
        "--output_dir",
        type=str,
        help="Output location passed to save_jsonl_dataset.",
    )
    parser.add_argument(
        "--split",
        type=str,
        default="train",
        help="Split name passed to save_jsonl_dataset (default: train).",
    )
    args = parser.parse_args()

    preference_dataset = load_jsonl(args.preference_dataset)

    # Each preference record yields two SFT samples (chosen and rejected),
    # which are flattened into a single list.
    sft_dataset = []
    for item in preference_dataset:
        sft_dataset.extend(convert_preference_to_sft(item))

    save_jsonl_dataset(sft_dataset, args.output_dir, args.split)
