""" Preprocess dataset for codev task """

import os
from datasets import Dataset, load_dataset
from tqdm import tqdm
from verl.utils.hdfs_io import copy, makedirs
import argparse
import json
from pprint import pprint
from transformers import AutoTokenizer


def mk_prompt_r1_v1(question):
    """Build a system/user chat for a Verilog task, keeping only the bare task text.

    The "[BEGIN]/[DONE]" output instruction is removed, everything from
    "The module head of the code should be:" onward is discarded, and any text
    up to and including the "Now, try to write ... guidelines:" lead-in is
    dropped. The remainder is stripped and terminated with a single newline.

    Args:
        question: Raw question text.

    Returns:
        A two-message list: the fixed system prompt, then the cleaned user prompt.
    """
    enclose_note = "Enclose your code with [BEGIN] and [DONE]. Only output the code snippet\nand do NOT output anything else.\n\n"
    head_marker = "The module head of the code should be:"
    guide_marker = 'Now, try to write the corresponding verilog code based on the following content through the above guidelines:\n'

    task = question.replace(enclose_note, "")
    # Drop the module-head hint and whatever follows it (no-op when absent).
    task = task.partition(head_marker)[0]
    # Keep only the text after the guideline lead-in, when that lead-in is present.
    _, found, tail = task.partition(guide_marker)
    if found:
        task = tail

    system_prompt = """You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and<answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>.  Now the user asks you to write verilog code. After thinking, when you finally reach a conclusion, enclose the final verilog code in ```verilog ``` within <answer> </answer> tags. i.e., <answer> ```verilog\n module top_module(in, out, ...) ... ``` </answer>.\n"""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": task.strip() + "\n"},
    ]


def mk_prompt_r1_v1_1(question):
    """Build a system/user chat for a Verilog task, keeping the module-head hint.

    The "[BEGIN]/[DONE]" output instruction is removed and any text up to and
    including the "Now, try to write ... guidelines:" lead-in is dropped. Unlike
    the v1 builder, the "module head" section is left in the user prompt.

    Args:
        question: Raw question text.

    Returns:
        A two-message list: the fixed system prompt, then the cleaned user prompt
        (stripped, with one trailing newline).
    """
    enclose_note = "Enclose your code with [BEGIN] and [DONE]. Only output the code snippet\nand do NOT output anything else.\n\n"
    guide_marker = 'Now, try to write the corresponding verilog code based on the following content through the above guidelines:\n'

    task = question.replace(enclose_note, "")
    # Keep only the text after the guideline lead-in, when that lead-in is present.
    _, found, tail = task.partition(guide_marker)
    if found:
        task = tail

    system_prompt = """You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and<answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>.  Now the user asks you to write verilog code. After thinking, when you finally reach a conclusion, enclose the final verilog code in ```verilog ``` within <answer> </answer> tags. i.e., <answer> ```verilog\n module top_module(in, out, ...) ... ``` </answer>.\n"""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": task.strip() + "\n"},
    ]


def mk_prompt_r1_v2(question):
    """Build a single-message prompt with the chat markup written out by hand.

    The "[BEGIN]/[DONE]" output instruction is removed from ``question`` and the
    result is embedded in a fixed template that already contains the
    ``<|im_start|>`` / ``<|im_end|>`` markers, the Thought/Solution guidelines,
    and an opening ``<think>`` tag for the assistant turn.

    Args:
        question: Raw question text.

    Returns:
        A one-element list holding a "user" message whose content is the fully
        rendered prompt.
    """
    # TODO: the prompt format still needs to be revised.
    cleaned = question.replace("Enclose your code with [BEGIN] and [DONE]. Only output the code snippet\nand do NOT output anything else.\n\n", "")

    # Doubled braces render as literal braces; {question} is the only placeholder.
    template = """<|im_start|>system
You are a helpful assistant.
<|im_end|>
<|im_start|>user
Your role as an assistant involves thoroughly exploring questions through a systematic long thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process.\n\nPlease structure your response into two main sections: Thought and Solution.\n\nIn the Thought section, detail your reasoning process using the specified format:\n```\n<think>\n{{thought with steps separated with \"\n\n\"}}\n</think>\n```\nEach step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps.\n\nIn the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The solution should remain a logical, accurate, concise expression style and detail necessary step needed to reach the conclusion, formatted as follows:\n```\n<answer>\n{{final formatted, precise, and clear solution}}\n</answer>\n```\nNow, try to write the corresponding verilog code based on the following content through the above guidelines:
{question}
<|im_end|>
<|im_start|>assistant
<think>"""
    return [{"role": "user", "content": template.format(question=cleaned)}]


def mk_prompt_r1_v3(question):
    """Build a single-message prompt with hand-written chat markup and no module head.

    The "[BEGIN]/[DONE]" output instruction is removed, everything from
    "The module head of the code should be:" onward is discarded, and the
    remaining text (not stripped) is placed between a fixed system/user preamble
    and an assistant turn that opens with ``<think>``.

    Args:
        question: Raw question text.

    Returns:
        A one-element list holding a "user" message whose content is the fully
        rendered prompt.
    """
    body = question.replace("Enclose your code with [BEGIN] and [DONE]. Only output the code snippet\nand do NOT output anything else.\n\n", "")
    # Drop the module-head hint and whatever follows it (no-op when absent).
    body = body.partition("The module head of the code should be:")[0]

    preamble = "<|im_start|>system\nA conversation between User and Assistant. The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.\n<|im_end|>\n<|im_start|>user\nNow, try to write the corresponding verilog code based on the following content:\n"
    closing = "\n<|im_end|>\n<|im_start|>assistant\n<think>"
    return [{"role": "user", "content": preamble + body + closing}]


def mk_prompt_r1(question):
    """Build the default system/user chat for a Verilog task.

    Only the "[BEGIN]/[DONE]" output instruction is removed from ``question``;
    the rest (including any module-head section) is kept, stripped of
    surrounding whitespace, and terminated with a single newline.

    Args:
        question: Raw question text.

    Returns:
        A two-message list: the fixed system prompt, then the cleaned user prompt.
    """
    enclose_note = "Enclose your code with [BEGIN] and [DONE]. Only output the code snippet\nand do NOT output anything else.\n\n"
    task = question.replace(enclose_note, "")

    system_prompt = """You are a helpful assistant. The assistant first thinks about the reasoning process in the mind and then provides the user with the answer. The reasoning process and answer are enclosed within <think> </think> and<answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>.  Now the user asks you to write verilog code. After thinking, when you finally reach a conclusion, enclose the final verilog code in ```verilog ``` within <answer> </answer> tags. i.e., <answer> ```verilog\n module top_module(in, out, ...) ... ``` </answer>.\n"""
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": task.strip() + "\n"},
    ]


if __name__ == '__main__':
    # Script entry point: load a JSONL dataset, drop examples whose user prompt is
    # too long, split into train/test, convert each row to the prompt/reward
    # record layout below, and write parquet (optionally JSONL and an HDFS copy).
    parser = argparse.ArgumentParser()
    parser.add_argument('--local_dir', default='data/codev/v1/3.1k_r1_filtered')
    parser.add_argument('--hdfs_dir', default=None)
    # parser.add_argument('--data_path', default='/nfs_global/S/lvhanqi/codev_data/decontamination_sft_model_filter_4.8k_and_qwen_32b_correct1234_system_prompt_codev_dataset_v3.jsonl')
    parser.add_argument('--data_path', default='/nfs_global/S/lvhanqi/codev_data/decontamination_sft_model_filter_0320_correct_synthesizable_r1_system_prompt_filter_codev_dataset_165k_v3.jsonl')
    parser.add_argument('--tokenizer_path', default='/share/collab/codemodel/models/Qwen2.5-Coder-7B')
    parser.add_argument('--train_size', type=int, default=15000)
    parser.add_argument('--test_size', type=int, default=984)
    parser.add_argument('--save_jsonl', action='store_true', help='Save dataset as jsonl files')
    parser.add_argument('--continuous_reward', action='store_true',
                        help='Store ground truth as a dict tagged with reward_mode="continuous"')
    # parser.add_argument('--template_type', type=str, default='base')

    args = parser.parse_args()

    TRAIN_SIZE = args.train_size
    TEST_SIZE = args.test_size
    # Examples whose user prompt tokenizes to more than this many tokens are dropped.
    MAX_PROMPT_TOKENS = 1500

    # Load custom JSONL dataset
    def gen_from_jsonl(path):
        """Yield one parsed JSON object per non-blank line of ``path``."""
        with open(path, encoding='utf-8') as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if line.strip():
                    yield json.loads(line)

    # Dataset.from_generator is avoided on purpose: HF caches its result, so after
    # the data file is updated it would still load the old content.
    # raw_dataset = Dataset.from_generator(gen_from_jsonl, gen_kwargs={'path': args.data_path})
    raw_data_list = list(gen_from_jsonl(args.data_path))
    raw_dataset = Dataset.from_list(raw_data_list)

    def make_question(question):
        """Return ``question`` as a [system, user] chat.

        A two-message system/user list is passed through unchanged. Otherwise the
        text (the string itself, or the "content" of the first message) is
        rendered with mk_prompt_r1.
        """
        if isinstance(question, list) and len(question) == 2 and question[0]["role"] == "system" and question[1]["role"] == "user":
            return question
        if not isinstance(question, str):
            question = question[0]["content"]
        return mk_prompt_r1(question)

    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)

    def filter_by_token_length(example):
        """Keep an example only if its user prompt fits in MAX_PROMPT_TOKENS tokens."""
        question = make_question(example["question"])
        # Only the user message is measured; the system prompt is not counted.
        tokens = tokenizer(question[1]["content"], truncation=False, padding=False)
        return len(tokens["input_ids"]) <= MAX_PROMPT_TOKENS

    raw_dataset = raw_dataset.filter(filter_by_token_length)
    raw_dataset = raw_dataset.shuffle(seed=42)
    print(len(raw_dataset))

    # Explicit check instead of `assert`, which is skipped under `python -O`.
    if len(raw_dataset) < TRAIN_SIZE + TEST_SIZE:
        raise ValueError(
            f"Not enough examples after filtering: have {len(raw_dataset)}, "
            f"need train_size + test_size = {TRAIN_SIZE + TEST_SIZE}"
        )
    train_dataset = raw_dataset.select(range(TRAIN_SIZE))
    test_dataset = raw_dataset.select(range(TRAIN_SIZE, TRAIN_SIZE + TEST_SIZE))

    def make_map_fn(split):
        """Return a map function that tags each converted record with ``split``."""
        def process_fn(example, idx):
            # Prefer an explicit ground truth message; fall back to the response field.
            if "ground_truth" in example:
                ground_truth = example["ground_truth"][0]["content"]
            else:
                ground_truth = example["response"]
            question = make_question(example["question"])
            if args.continuous_reward:
                ground_truth = {"answer": ground_truth, "reward_mode": "continuous"}

            return {
                "data_source": "codev",
                "prompt": question,
                "ability": "verilog",
                "reward_model": {
                    "style": "rule",
                    "ground_truth": ground_truth
                },
                "extra_info": {
                    'split': split,
                    'index': idx,
                }
            }
        return process_fn

    train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
    test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)

    # Expand "~" once so the directory that is created is the same one written to
    # (and copied from) below.
    local_dir = os.path.expanduser(args.local_dir)
    hdfs_dir = args.hdfs_dir

    # Create local directory if not exists
    os.makedirs(local_dir, exist_ok=True)

    train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
    test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))

    if args.save_jsonl:
        # Write each split as one JSON object per line.
        for split_name, split_dataset in (('train', train_dataset), ('test', test_dataset)):
            with open(os.path.join(local_dir, f'{split_name}.jsonl'), 'w', encoding='utf-8') as f:
                f.writelines(json.dumps(example) + '\n' for example in split_dataset)
        print("JSONL file saved!!")

    if hdfs_dir is not None:
        # Mirror the local output directory to HDFS via the verl hdfs_io helpers.
        makedirs(hdfs_dir)
        copy(src=local_dir, dst=hdfs_dir)