Commit beaacea5 by wyt2000

feat: support pretrain for minicpm

parent 2d2620bd
@@ -167,4 +167,5 @@ config/
saves/
output/
wandb/
data/
submit.sh
ret_one
### model
model_name_or_path: /lustre/S/huangdi/open_for_out/models/MiniCPM-training/models/MiniCPM-quant
### method
stage: pt
do_train: true
finetuning_type: full
### dataset
dataset: c4_demo
cutoff_len: 2048
overwrite_cache: true
preprocessing_num_workers: 80
# streaming: true
# max_steps: 102500
# should_process: false
### output
output_dir: saves/bitnet-cpt-on-the-stack-all-32x40g-r8a100b-0726
logging_steps: 1
save_steps: 3000
plot_loss: true
overwrite_output_dir: true
### train
# resume_from_checkpoint: saves/bitnet-cpt-on-the-stack-all-32x40g-r8a100b-0726/checkpoint-75000
per_device_train_batch_size: 8
gradient_accumulation_steps: 1
learning_rate: 5.0e-5
num_train_epochs: 1.0
lr_scheduler_type: cosine
warmup_steps: 375
adam_beta1: 0.9
adam_beta2: 0.95
bf16: true
ddp_timeout: 180000000
cache_dir: /lustre/S/huangdi/open_for_out/models/MiniCPM-training/pretrain/LLaMA-Factory/cache
The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it.
Currently we support datasets in the **alpaca** and **sharegpt** formats.
```json
"dataset_name": {
"hf_hub_url": "the name of the dataset repository on the Hugging Face hub. (if specified, ignore script_url and file_name)",
"ms_hub_url": "the name of the dataset repository on the Model Scope hub. (if specified, ignore script_url and file_name)",
"script_url": "the name of the directory containing a dataset loading script. (if specified, ignore file_name)",
"file_name": "the name of the dataset folder or dataset file in this directory. (required if above are not specified)",
"formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})",
"ranking": "whether the dataset is a preference dataset or not. (default: False)",
"subset": "the name of the subset. (optional, default: None)",
"split": "the name of dataset split to be used. (optional, default: train)",
"folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)",
"num_samples": "the number of samples in the dataset to be used. (optional, default: None)",
"columns (optional)": {
"prompt": "the column name in the dataset containing the prompts. (default: instruction)",
"query": "the column name in the dataset containing the queries. (default: input)",
"response": "the column name in the dataset containing the responses. (default: output)",
"history": "the column name in the dataset containing the histories. (default: None)",
"messages": "the column name in the dataset containing the messages. (default: conversations)",
"system": "the column name in the dataset containing the system prompts. (default: None)",
"tools": "the column name in the dataset containing the tool description. (default: None)",
"images": "the column name in the dataset containing the image inputs. (default: None)",
"chosen": "the column name in the dataset containing the chosen answers. (default: None)",
"rejected": "the column name in the dataset containing the rejected answers. (default: None)",
"kto_tag": "the column name in the dataset containing the kto tags. (default: None)"
},
"tags (optional, used for the sharegpt format)": {
"role_tag": "the key in the message represents the identity. (default: from)",
"content_tag": "the key in the message represents the content. (default: value)",
"user_tag": "the value of the role_tag represents the user. (default: human)",
"assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)",
"observation_tag": "the value of the role_tag represents the tool results. (default: observation)",
"function_tag": "the value of the role_tag represents the function call. (default: function_call)",
"system_tag": "the value of the role_tag represents the system prompt. (default: system, can override system column)"
}
}
```
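Before launching a run, it can help to verify that the dataset named in your config is actually registered. A minimal sketch (not part of LLaMA-Factory; the path and the `c4_demo` name are just examples):
```python
import json

# Load the registry and check that the dataset referenced by `dataset: ...` exists.
with open("data/dataset_info.json", "r", encoding="utf-8") as f:
    dataset_info = json.load(f)

name = "c4_demo"  # whatever you put after `dataset:` in the training config
if name not in dataset_info:
    raise KeyError(f"{name} is not registered in dataset_info.json")
print(dataset_info[name].get("formatting", "alpaca"))  # falls back to the default alpaca format
```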
## Alpaca Format
### Supervised Fine-Tuning Dataset
- [Example dataset](alpaca_en_demo.json)
In supervised fine-tuning, the content of the `instruction` column is concatenated with the content of the `input` column and used as the human prompt, i.e. the human prompt is `instruction\ninput`. The `output` column contains the model response.
The `system` column will be used as the system prompt if specified.
The `history` column is a list consisting of string tuples representing prompt-response pairs in the history messages. Note that the responses in the history **will also be learned by the model** in supervised fine-tuning.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"output": "model response (required)",
"system": "system prompt (optional)",
"history": [
["human instruction in the first round (optional)", "model response in the first round (optional)"],
["human instruction in the second round (optional)", "model response in the second round (optional)"]
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"system": "system",
"history": "history"
}
}
```
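The following sketch illustrates how such an example becomes a chat sequence, mirroring (in simplified form) the `convert_alpaca` function included later in this commit; the sample values are made up:
```python
# Simplified, illustrative version of the alpaca alignment described above.
example = {
    "instruction": "Translate to French",
    "input": "Hello",
    "output": "Bonjour",
    "history": [["Say hi", "Hi!"]],
}

prompt = []
for old_prompt, old_response in example.get("history", []):
    prompt.append({"role": "user", "content": old_prompt})         # history turns are kept
    prompt.append({"role": "assistant", "content": old_response})  # and also learned in SFT

# instruction and input are joined with a newline: "instruction\ninput"
content = "\n".join(part for part in (example["instruction"], example["input"]) if part)
prompt.append({"role": "user", "content": content})
response = [{"role": "assistant", "content": example["output"]}]
print(prompt)
print(response)
```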
### Pre-training Dataset
- [Example dataset](c4_demo.json)
In pre-training, only the `text` column will be used for model learning.
```json
[
{"text": "document"},
{"text": "document"}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "text"
}
}
```
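A quick way to sanity-check such a corpus (assuming the `datasets` library is installed and the file sits at `data/c4_demo.json`):
```python
from datasets import load_dataset

# Only the column mapped to `prompt` ("text" here) is used for language modeling.
ds = load_dataset("json", data_files="data/c4_demo.json", split="train")
print(ds.column_names)     # e.g. ['text']
print(ds[0]["text"][:80])  # beginning of the first document
```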
### Preference Dataset
Preference datasets are used for reward modeling, DPO training and ORPO training.
They require a better response in the `chosen` column and a worse response in the `rejected` column.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"chosen": "chosen answer (required)",
"rejected": "rejected answer (required)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"ranking": true,
"columns": {
"prompt": "instruction",
"query": "input",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
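Internally, a preference example is aligned into one prompt plus two candidate responses, chosen first and rejected second; a simplified sketch (mirroring the pairwise branch of `convert_alpaca` later in this commit, sample values made up):
```python
example = {
    "instruction": "Summarize the text",
    "input": "A long article ...",
    "chosen": "A concise, faithful summary.",
    "rejected": "An off-topic answer.",
}

prompt = [{"role": "user", "content": "\n".join([example["instruction"], example["input"]])}]
response = [
    {"role": "assistant", "content": example["chosen"]},    # better answer comes first
    {"role": "assistant", "content": example["rejected"]},  # worse answer comes second
]
print(prompt)
print(response)
```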
### KTO Dataset
- [Example dataset](kto_en_demo.json)
KTO datasets require an extra `kto_tag` column containing the boolean human feedback.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"output": "model response (required)",
"kto_tag": "human feedback [true/false] (required)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"kto_tag": "kto_tag"
}
}
```
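A KTO example is aligned into the real response plus an empty placeholder, ordered by the boolean tag; a simplified sketch of the kto branch of `convert_alpaca` shown later in this commit:
```python
example = {"instruction": "Say hi", "input": "", "output": "Hi!", "kto_tag": True}

response = [{"role": "assistant", "content": example["output"]}]
if example["kto_tag"]:
    # desirable response: the placeholder follows the real answer
    response = response + [{"role": "assistant", "content": ""}]
else:
    # undesirable response: the placeholder comes first
    response = [{"role": "assistant", "content": ""}] + response
print(response)
```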
### Multimodal Dataset
- [Example dataset](mllm_demo.json)
Multimodal datasets require an `images` column containing the paths to the input images. Currently only one image per example is supported.
```json
[
{
"instruction": "human instruction (required)",
"input": "human input (optional)",
"output": "model response (required)",
"images": [
"image path (required)"
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"images": "images"
}
}
```
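When loading from a local file or script, image paths are resolved relative to the dataset directory if such a file exists there; a simplified sketch of `_convert_images` from later in this commit (the directory and file name are examples):
```python
import os

dataset_dir = "data"  # example value of the dataset directory
example = {"images": ["mllm_demo_data/1.jpg"]}

resolved = []
for image in example["images"]:
    candidate = os.path.join(dataset_dir, image)
    # keep the joined path only if the file actually exists there
    resolved.append(candidate if os.path.isfile(candidate) else image)
print(resolved)
```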
## Sharegpt Format
### Supervised Fine-Tuning Dataset
- [Example dataset](glaive_toolcall_en_demo.json)
Compared to the alpaca format, the sharegpt format allows a dataset to contain **more roles**, such as human, gpt, observation and function. They are presented as a list of objects in the `conversations` column.
Note that human and observation messages should appear in odd positions, while gpt and function messages should appear in even positions.
```json
[
{
"conversations": [
{
"from": "human",
"value": "human instruction"
},
{
"from": "function_call",
"value": "tool arguments"
},
{
"from": "observation",
"value": "tool result"
},
{
"from": "gpt",
"value": "model response"
}
],
"system": "system prompt (optional)",
"tools": "tool description (optional)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"system": "system",
"tools": "tools"
}
}
```
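The alternation rule above can be checked with a few lines of Python; this is a simplified sketch of the validation performed by `convert_sharegpt` later in this commit:
```python
ODD_TAGS = ("human", "observation")   # allowed at the 1st, 3rd, ... positions
EVEN_TAGS = ("gpt", "function_call")  # allowed at the 2nd, 4th, ... positions

def check_conversation(conversations):
    accept = (ODD_TAGS, EVEN_TAGS)
    for idx, message in enumerate(conversations):
        if message["from"] not in accept[idx % 2]:
            return False
    return len(conversations) % 2 == 0  # complete prompt/response pairs

demo = [
    {"from": "human", "value": "human instruction"},
    {"from": "gpt", "value": "model response"},
]
print(check_conversation(demo))  # True
```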
### Preference Dataset
- [Example dataset](dpo_en_demo.json)
Preference datasets in sharegpt format also require a better message in the `chosen` column and a worse message in the `rejected` column.
```json
[
{
"conversations": [
{
"from": "human",
"value": "human instruction"
},
{
"from": "gpt",
"value": "model response"
},
{
"from": "human",
"value": "human instruction"
}
],
"chosen": {
"from": "gpt",
"value": "chosen answer (required)"
},
"rejected": {
"from": "gpt",
"value": "rejected answer (required)"
}
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"ranking": true,
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### OpenAI Format
The openai format is simply a special case of the sharegpt format, where the first message may be a system prompt.
```json
[
{
"messages": [
{
"role": "system",
"content": "system prompt (optional)"
},
{
"role": "user",
"content": "human instruction"
},
{
"role": "assistant",
"content": "model response"
}
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant",
"system_tag": "system"
}
}
```
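With these tags the loader reads `role`/`content` instead of `from`/`value`, and a leading system message is peeled off before the conversation is aligned; a simplified sketch (the real logic lives in `convert_sharegpt` later in this commit):
```python
tags = {"role_tag": "role", "content_tag": "content",
        "user_tag": "user", "assistant_tag": "assistant", "system_tag": "system"}

messages = [
    {"role": "system", "content": "system prompt"},
    {"role": "user", "content": "human instruction"},
    {"role": "assistant", "content": "model response"},
]

system = ""
if messages and messages[0][tags["role_tag"]] == tags["system_tag"]:
    system = messages[0][tags["content_tag"]]  # overrides the system column
    messages = messages[1:]

conversation = [{"role": m[tags["role_tag"]], "content": m[tags["content_tag"]]} for m in messages]
print(system)
print(conversation)
```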
The KTO datasets and multimodal datasets in sharegpt format are similar to those in the alpaca format.
Pre-training datasets are **incompatible** with the sharegpt format.
[dataset_info.json](dataset_info.json) contains all available datasets. If you wish to use a custom dataset, please **make sure** to add a *dataset description* to the `dataset_info.json` file and use the dataset by setting `dataset: dataset_name` in the config.
Currently we support datasets in the **alpaca** and **sharegpt** formats.
```json
"数据集名称": {
"hf_hub_url": "Hugging Face 的数据集仓库地址(若指定,则忽略 script_url 和 file_name)",
"ms_hub_url": "ModelScope 的数据集仓库地址(若指定,则忽略 script_url 和 file_name)",
"script_url": "包含数据加载脚本的本地文件夹名称(若指定,则忽略 file_name)",
"file_name": "该目录下数据集文件夹或文件的名称(若上述参数未指定,则此项必需)",
"formatting": "数据集格式(可选,默认:alpaca,可以为 alpaca 或 sharegpt)",
"ranking": "是否为偏好数据集(可选,默认:False)",
"subset": "数据集子集的名称(可选,默认:None)",
"split": "所使用的数据集切分(可选,默认:train)",
"folder": "Hugging Face 仓库的文件夹名称(可选,默认:None)",
"num_samples": "该数据集所使用的样本数量。(可选,默认:None)",
"columns(可选)": {
"prompt": "数据集代表提示词的表头名称(默认:instruction)",
"query": "数据集代表请求的表头名称(默认:input)",
"response": "数据集代表回答的表头名称(默认:output)",
"history": "数据集代表历史对话的表头名称(默认:None)",
"messages": "数据集代表消息列表的表头名称(默认:conversations)",
"system": "数据集代表系统提示的表头名称(默认:None)",
"tools": "数据集代表工具描述的表头名称(默认:None)",
"images": "数据集代表图像输入的表头名称(默认:None)",
"chosen": "数据集代表更优回答的表头名称(默认:None)",
"rejected": "数据集代表更差回答的表头名称(默认:None)",
"kto_tag": "数据集代表 KTO 标签的表头名称(默认:None)"
},
"tags(可选,用于 sharegpt 格式)": {
"role_tag": "消息中代表发送者身份的键名(默认:from)",
"content_tag": "消息中代表文本内容的键名(默认:value)",
"user_tag": "消息中代表用户的 role_tag(默认:human)",
"assistant_tag": "消息中代表助手的 role_tag(默认:gpt)",
"observation_tag": "消息中代表工具返回结果的 role_tag(默认:observation)",
"function_tag": "消息中代表工具调用的 role_tag(默认:function_call)",
"system_tag": "消息中代表系统提示的 role_tag(默认:system,会覆盖 system column)"
}
}
```
## Alpaca Format
### Supervised Fine-Tuning Dataset
- [Example dataset](alpaca_zh_demo.json)
In supervised fine-tuning, the content of the `instruction` column is concatenated with the content of the `input` column and used as the human prompt, i.e. the human prompt is `instruction\ninput`. The content of the `output` column is the model response.
If specified, the content of the `system` column is used as the system prompt.
The `history` column is a list of string pairs, each representing the prompt and response of one turn in the history messages. Note that in supervised fine-tuning the responses in the history **will also be learned by the model**.
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"output": "模型回答(必填)",
"system": "系统提示词(选填)",
"history": [
["第一轮指令(选填)", "第一轮回答(选填)"],
["第二轮指令(选填)", "第二轮回答(选填)"]
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"system": "system",
"history": "history"
}
}
```
### Pre-training Dataset
- [Example dataset](c4_demo.json)
In pre-training, only the content of the `text` column is used for model learning.
```json
[
{"text": "document"},
{"text": "document"}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "text"
}
}
```
### Preference Dataset
Preference datasets are used for reward model training, DPO training and ORPO training.
They require a better response in the `chosen` column and a worse response in the `rejected` column.
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"chosen": "优质回答(必填)",
"rejected": "劣质回答(必填)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"ranking": true,
"columns": {
"prompt": "instruction",
"query": "input",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### KTO Dataset
- [Example dataset](kto_en_demo.json)
KTO datasets require an extra `kto_tag` column containing boolean human feedback.
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"output": "模型回答(必填)",
"kto_tag": "人类反馈 [true/false](必填)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"kto_tag": "kto_tag"
}
}
```
### Multimodal Dataset
- [Example dataset](mllm_demo.json)
Multimodal datasets require an extra `images` column containing the paths to the input images. Currently only one image per example is supported.
```json
[
{
"instruction": "人类指令(必填)",
"input": "人类输入(选填)",
"output": "模型回答(必填)",
"images": [
"图像路径(必填)"
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"columns": {
"prompt": "instruction",
"query": "input",
"response": "output",
"images": "images"
}
}
```
## Sharegpt Format
### Supervised Fine-Tuning Dataset
- [Example dataset](glaive_toolcall_zh_demo.json)
Compared to the alpaca format, the sharegpt format supports **more role types**, such as human, gpt, observation and function. They are presented as a list of objects in the `conversations` column.
Note that human and observation must appear in odd positions, while gpt and function must appear in even positions.
```json
[
{
"conversations": [
{
"from": "human",
"value": "人类指令"
},
{
"from": "function_call",
"value": "工具参数"
},
{
"from": "observation",
"value": "工具结果"
},
{
"from": "gpt",
"value": "模型回答"
}
],
"system": "系统提示词(选填)",
"tools": "工具描述(选填)"
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "conversations",
"system": "system",
"tools": "tools"
}
}
```
### Preference Dataset
- [Example dataset](dpo_zh_demo.json)
Preference datasets in sharegpt format also require a better message in the `chosen` column and a worse message in the `rejected` column.
```json
[
{
"conversations": [
{
"from": "human",
"value": "人类指令"
},
{
"from": "gpt",
"value": "模型回答"
},
{
"from": "human",
"value": "人类指令"
}
],
"chosen": {
"from": "gpt",
"value": "优质回答"
},
"rejected": {
"from": "gpt",
"value": "劣质回答"
}
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"ranking": true,
"columns": {
"messages": "conversations",
"chosen": "chosen",
"rejected": "rejected"
}
}
```
### OpenAI Format
The OpenAI format is simply a special case of the sharegpt format, where the first message may be a system prompt.
```json
[
{
"messages": [
{
"role": "system",
"content": "系统提示词(选填)"
},
{
"role": "user",
"content": "人类指令"
},
{
"role": "assistant",
"content": "模型回答"
}
]
}
]
```
Regarding the above dataset, the *dataset description* in `dataset_info.json` should be:
```json
"dataset_name": {
"file_name": "data.json",
"formatting": "sharegpt",
"columns": {
"messages": "messages"
},
"tags": {
"role_tag": "role",
"content_tag": "content",
"user_tag": "user",
"assistant_tag": "assistant",
"system_tag": "system"
}
}
```
KTO datasets and multimodal datasets in sharegpt format are similar to those in the alpaca format.
Pre-training datasets **do not support** the sharegpt format.
{
"c4_demo": {
"file_name": "c4_demo.json",
"columns": {
"prompt": "text"
}
}
}
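As a hedged illustration of the registration step described in the README above, a custom pre-training corpus could be appended to this file programmatically; the `my_corpus` name and file are hypothetical:
```python
import json

with open("data/dataset_info.json", "r", encoding="utf-8") as f:
    info = json.load(f)

# Hypothetical entry: a local JSON corpus whose documents live in a "text" column.
info["my_corpus"] = {"file_name": "my_corpus.json", "columns": {"prompt": "text"}}

with open("data/dataset_info.json", "w", encoding="utf-8") as f:
    json.dump(info, f, indent=2, ensure_ascii=False)
```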
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
fsdp_config:
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_backward_prefetch: BACKWARD_PRE
fsdp_forward_prefetch: false
fsdp_cpu_ram_efficient_loading: true
fsdp_offload_params: true # offload may affect training speed
fsdp_sharding_strategy: FULL_SHARD
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_sync_module_states: true
fsdp_use_orig_params: true
machine_rank: 0
main_training_function: main
mixed_precision: fp16 # or bf16
num_machines: 1 # the number of nodes
num_processes: 2 # the number of GPUs in all nodes
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
{
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 0,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"round_robin_gradients": true
}
}
\ No newline at end of file
{
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 2,
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"round_robin_gradients": true
}
}
\ No newline at end of file
{
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 5e8,
"contiguous_gradients": true,
"round_robin_gradients": true
}
}
\ No newline at end of file
{
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 3,
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
\ No newline at end of file
{
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": "auto",
"loss_scale": 0,
"loss_scale_window": 1000,
"initial_scale_power": 16,
"hysteresis": 2,
"min_loss_scale": 1
},
"bf16": {
"enabled": "auto"
},
"zero_optimization": {
"stage": 3,
"offload_optimizer": {
"device": "cpu",
"pin_memory": true
},
"offload_param": {
"device": "cpu",
"pin_memory": true
},
"overlap_comm": true,
"contiguous_gradients": true,
"sub_group_size": 1e9,
"reduce_bucket_size": "auto",
"stage3_prefetch_bucket_size": "auto",
"stage3_param_persistence_threshold": "auto",
"stage3_max_live_parameters": 1e9,
"stage3_max_reuse_distance": 1e9,
"stage3_gather_16bit_weights_on_model_save": true
}
}
\ No newline at end of file
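The DeepSpeed configs above leave the batch-size fields as "auto"; DeepSpeed resolves them so that the global batch equals the per-GPU micro batch times gradient accumulation times the data-parallel world size. A small sketch with example numbers (the world size is hypothetical):
```python
micro_batch_per_gpu = 1          # train_micro_batch_size_per_gpu
gradient_accumulation_steps = 2  # gradient_accumulation_steps
world_size = 8                   # e.g. one node with 8 GPUs

train_batch_size = micro_batch_per_gpu * gradient_accumulation_steps * world_size
print(train_batch_size)  # 16
```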
### model
model_name_or_path: Qwen/Qwen2-1.5B-Instruct
### method
stage: sft
do_train: true
finetuning_type: full
use_adam_mini: true
### dataset
dataset: identity,alpaca_en_demo
template: qwen
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/qwen2-1_5b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: sft
do_train: true
finetuning_type: full
use_badam: true
badam_mode: layer
badam_switch_mode: ascending
badam_switch_interval: 50
badam_verbose: 2
# deepspeed: examples/deepspeed/ds_z3_config.json
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
quantization_bit: 4
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
#!/bin/bash
# DO NOT use GPTQ/AWQ model in FSDP+QLoRA
CUDA_VISIBLE_DEVICES=0,1 accelerate launch \
--config_file examples/accelerate/fsdp_config.yaml \
src/train.py examples/extras/fsdp_qlora/llama3_lora_sft.yaml
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: sft
do_train: true
finetuning_type: full
use_galore: true
galore_layerwise: true
galore_target: mlp,self_attn
galore_rank: 128
galore_scale: 2.0
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
pure_bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
#!/bin/bash
python scripts/llama_pro.py \
--model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
--output_dir models/llama3-8b-pro \
--num_expand 8
### model
model_name_or_path: models/llama3-8b-pro
### method
stage: sft
do_train: true
finetuning_type: freeze
freeze_trainable_layers: 8
freeze_trainable_modules: all
use_llama_pro: true
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b-pro/freeze/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
loraplus_lr_ratio: 16.0
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: sft
do_train: true
finetuning_type: full
mixture_of_depths: convert
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b-mod/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
optim: paged_adamw_8bit
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
pure_bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
#!/bin/bash
python scripts/pissa_init.py \
--model_name_or_path meta-llama/Meta-Llama-3-8B-Instruct \
--output_dir models/llama3-8b-pissa
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
pissa_init: true
pissa_iter: 16
pissa_convert: true
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
template: llama3
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
adapter_name_or_path: saves/llama3-8b/lora/sft
template: llama3
finetuning_type: lora
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
template: llama3
infer_backend: vllm
vllm_enforce_eager: true
model_name_or_path: llava-hf/llava-1.5-7b-hf
template: vicuna
visual_inputs: true
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
template: llama3
### export
export_dir: models/llama3_gptq
export_quantization_bit: 4
export_quantization_dataset: data/c4_demo.json
export_size: 2
export_device: cpu
export_legacy_format: false
### Note: DO NOT use quantized model or quantization_bit when merging lora adapters
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
adapter_name_or_path: saves/llama3-8b/lora/sft
template: llama3
finetuning_type: lora
### export
export_dir: models/llama3_lora_sft
export_size: 2
export_device: cpu
export_legacy_format: false
### model
model_name_or_path: saves/llama3-8b/full/sft
### method
stage: sft
do_predict: true
finetuning_type: full
### dataset
eval_dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 50
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/full/predict
overwrite_output_dir: true
### eval
per_device_eval_batch_size: 1
predict_with_generate: true
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: sft
do_train: true
finetuning_type: full
deepspeed: examples/deepspeed/ds_z3_config.json
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/full/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: dpo
do_train: true
finetuning_type: lora
lora_target: all
pref_beta: 0.1
pref_loss: sigmoid # choices: [sigmoid (dpo), orpo, simpo]
### dataset
dataset: dpo_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/dpo
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 5.0e-6
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
adapter_name_or_path: saves/llama3-8b/lora/sft
### method
finetuning_type: lora
### dataset
task: mmlu_test # choices: [mmlu_test, ceval_validation, cmmlu_test]
template: fewshot
lang: en
n_shot: 5
### output
save_dir: saves/llama3-8b/lora/eval
### eval
batch_size: 4
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: kto
do_train: true
finetuning_type: lora
lora_target: all
pref_beta: 0.1
### dataset
dataset: kto_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/kto
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 5.0e-6
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
reward_model: saves/llama3-8b/lora/reward
### method
stage: ppo
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/ppo
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-5
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### generate
max_new_tokens: 512
top_k: 0
top_p: 0.9
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
adapter_name_or_path: saves/llama3-8b/lora/sft
### method
stage: sft
do_predict: true
finetuning_type: lora
### dataset
eval_dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 50
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/predict
overwrite_output_dir: true
### eval
per_device_eval_batch_size: 1
predict_with_generate: true
ddp_timeout: 180000000
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: pt
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: c4_demo
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/pretrain
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: rm
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: dpo_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/reward
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
deepspeed: examples/deepspeed/ds_z0_config.json
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
deepspeed: examples/deepspeed/ds_z3_config.json
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 2
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
tokenized_path: saves/llama3-8b/dataset/sft
### output
output_dir: saves/llama3-8b/lora/sft
overwrite_output_dir: true
### model
model_name_or_path: llava-hf/llava-1.5-7b-hf
visual_inputs: true
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: mllm_demo
template: vicuna
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llava1_5-7b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: Qwen/Qwen2-VL-7B-Instruct
visual_inputs: true
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: mllm_demo
template: qwen2_vl
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/qwen2_vl-7b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: ISTA-DASLab/Meta-Llama-3-8B-Instruct-AQLM-2Bit-1x16
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-AWQ
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
### model
model_name_or_path: meta-llama/Meta-Llama-3-8B-Instruct
quantization_bit: 4
quantization_method: bitsandbytes # choices: [bitsandbytes (4/8), hqq (2/3/4/5/6/8), eetq (8)]
### method
stage: sft
do_train: true
finetuning_type: lora
lora_target: all
### dataset
dataset: identity,alpaca_en_demo
template: llama3
cutoff_len: 1024
max_samples: 1000
overwrite_cache: true
preprocessing_num_workers: 16
### output
output_dir: saves/llama3-8b/lora/sft
logging_steps: 10
save_steps: 500
plot_loss: true
overwrite_output_dir: true
### train
per_device_train_batch_size: 1
gradient_accumulation_steps: 8
learning_rate: 1.0e-4
num_train_epochs: 3.0
lr_scheduler_type: cosine
warmup_ratio: 0.1
bf16: true
ddp_timeout: 180000000
### eval
val_size: 0.1
per_device_eval_batch_size: 1
eval_strategy: steps
eval_steps: 500
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
#- Resources
# (TODO)
# Please modify your requirements
#SBATCH -p r8nv-gpu-dedicated # Submit to the 'r8nv-gpu-dedicated' partition
#SBATCH -t 7-00:00:00 # Run for a maximum time of 7 days, 00 hours, 00 mins, 00 secs
#SBATCH --nodes=4 # Request N nodes
#SBATCH --gres=gpu:8 # Request M GPU per node
#SBATCH --ntasks=4 #
### #SBATCH --ntasks-per-node=1 # Request P tasks per node
#SBATCH --cpus-per-task=16 # Request Q cores per task, i.e. P*Q cores per node
#SBATCH --qos=normal # Request QOS Type
### #SBATCH --constraint="IB&A100"
#SBATCH --nodelist=r8a100-b[00,02,05-06]
###
### The system will allocate 8 or 16 cores per GPU by default.
### If you need more or less, use following:
### #SBATCH --cpus-per-task=K # Request K cores
###
###
### Without specifying the constraint, any available nodes that meet the requirement will be allocated
### You can specify the characteristics of the compute nodes, and even the names of the compute nodes
###
### #SBATCH --nodelist=r8a100-b[00,02,05-06]
### #SBATCH --constraint="Volta|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
###
# set constraint for RTX8000 to meet my cuda
#- Log information
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded
##- Tools
module load cluster-tools/v1.0
module load slurm-tools/v1.0
##- language
module load python3/3.8.16
##- CUDA
module load cuda-cudnn/11.6-8.4.1
##- virtualenv
# source xxxxx/activate
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
#- Warning! Please do not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Use GPU ${CUDA_VISIBLE_DEVICES}" # which gpus
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
# [EDIT HERE(TODO)]
# export http_proxy=10.200.22.22:18421
# export https_proxy=10.200.22.22:18421
export GIT_SSL_NO_VERIFY=1
export HF_ENDPOINT=https://hf-mirror.com
master_node=$(scontrol show hostname $SLURM_NODELIST | head -n 1)
# Resolve the hostname to an IP address
master_ip=$(getent ahosts $master_node | grep "^[0-9]" | head -n 1 | awk '{ print $1 }')
export TASK_NAME=minicpm_pretrain
echo $master_node
echo $master_ip
scontrol show hostnames $SLURM_NODELIST
srun bash -c "FORCE_TORCHRUN=1 NNODES=4 RANK=\$SLURM_NODEID MASTER_ADDR=$master_ip MASTER_PORT=29500 llamafactory-cli train configs/$TASK_NAME.yaml > ret_one/${TASK_NAME}_log.\$SLURM_NODEID 2>&1"
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .collator import (
CustomDataCollatorForSeq2Seq,
KTODataCollatorWithPadding,
PairwiseDataCollatorWithPadding,
SFTDataCollatorWith4DAttentionMask,
)
from .data_utils import Role, split_dataset
from .loader import get_dataset
from .template import TEMPLATES, Template, get_template_and_fix_tokenizer
__all__ = [
"CustomDataCollatorForSeq2Seq",
"KTODataCollatorWithPadding",
"PairwiseDataCollatorWithPadding",
"SFTDataCollatorWith4DAttentionMask",
"Role",
"split_dataset",
"get_dataset",
"TEMPLATES",
"Template",
"get_template_and_fix_tokenizer",
]
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from functools import partial
from typing import TYPE_CHECKING, Any, Dict, List, Union
from datasets import Features
from ..extras.logging import get_logger
from .data_utils import Role
if TYPE_CHECKING:
from datasets import Dataset, IterableDataset
from transformers import Seq2SeqTrainingArguments
from ..hparams import DataArguments
from .parser import DatasetAttr
logger = get_logger(__name__)
def _convert_images(images: List[Any], dataset_attr: "DatasetAttr", data_args: "DataArguments") -> List[Any]:
r"""
Optionally prepends the dataset directory to the image paths when loading from local disk.
"""
outputs = []
if dataset_attr.load_from in ["script", "file"]:
for image in images:
if isinstance(image, str) and os.path.isfile(os.path.join(data_args.dataset_dir, image)):
outputs.append(os.path.join(data_args.dataset_dir, image))
else:
outputs.append(image)
return outputs
def convert_alpaca(
examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
) -> Dict[str, List[Any]]:
r"""
Converts alpaca format dataset to the standard format.
"""
outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args)
for i in range(len(examples[dataset_attr.prompt])):
prompt = []
if dataset_attr.history and isinstance(examples[dataset_attr.history][i], list):
for old_prompt, old_response in examples[dataset_attr.history][i]:
prompt.append({"role": Role.USER.value, "content": old_prompt})
prompt.append({"role": Role.ASSISTANT.value, "content": old_response})
content = []
if dataset_attr.prompt and examples[dataset_attr.prompt][i]:
content.append(examples[dataset_attr.prompt][i])
if dataset_attr.query and examples[dataset_attr.query][i]:
content.append(examples[dataset_attr.query][i])
prompt.append({"role": Role.USER.value, "content": "\n".join(content)}) # "prompt\nquery"
if dataset_attr.kto_tag and isinstance(examples[dataset_attr.kto_tag][i], bool): # kto example
response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}]
if examples[dataset_attr.kto_tag][i]:
response = response + [{"role": Role.ASSISTANT.value, "content": ""}]
else:
response = [{"role": Role.ASSISTANT.value, "content": ""}] + response
elif (
dataset_attr.ranking
and isinstance(examples[dataset_attr.chosen][i], str)
and isinstance(examples[dataset_attr.rejected][i], str)
): # pairwise example
response = [
{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.chosen][i]},
{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.rejected][i]},
]
elif dataset_attr.response and isinstance(examples[dataset_attr.response][i], str): # normal example
response = [{"role": Role.ASSISTANT.value, "content": examples[dataset_attr.response][i]}]
else: # unsupervised
response = []
outputs["prompt"].append(prompt)
outputs["response"].append(response)
outputs["system"].append(examples[dataset_attr.system][i] if dataset_attr.system else "")
outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else [])
return outputs
def convert_sharegpt(
examples: Dict[str, List[Any]], dataset_attr: "DatasetAttr", data_args: "DataArguments"
) -> Dict[str, List[Any]]:
r"""
Converts sharegpt format dataset to the standard format.
"""
outputs = {"prompt": [], "response": [], "system": [], "tools": [], "images": []}
convert_images = partial(_convert_images, dataset_attr=dataset_attr, data_args=data_args)
tag_mapping = {
dataset_attr.user_tag: Role.USER.value,
dataset_attr.assistant_tag: Role.ASSISTANT.value,
dataset_attr.observation_tag: Role.OBSERVATION.value,
dataset_attr.function_tag: Role.FUNCTION.value,
dataset_attr.system_tag: Role.SYSTEM.value,
}
odd_tags = (dataset_attr.user_tag, dataset_attr.observation_tag)
even_tags = (dataset_attr.assistant_tag, dataset_attr.function_tag)
accept_tags = (odd_tags, even_tags)
for i, messages in enumerate(examples[dataset_attr.messages]):
if len(messages) == 0:
continue
if dataset_attr.system_tag and messages[0][dataset_attr.role_tag] == dataset_attr.system_tag:
system = messages[0][dataset_attr.content_tag]
messages = messages[1:]
else:
system = examples[dataset_attr.system][i] if dataset_attr.system else ""
aligned_messages = []
broken_data = False
for turn_idx, message in enumerate(messages):
if message[dataset_attr.role_tag] not in accept_tags[turn_idx % 2]:
logger.warning("Invalid role tag in {}.".format(messages))
broken_data = True
aligned_messages.append(
{"role": tag_mapping[message[dataset_attr.role_tag]], "content": message[dataset_attr.content_tag]}
)
if (not dataset_attr.ranking and len(aligned_messages) % 2 != 0) or (
dataset_attr.ranking and len(aligned_messages) % 2 == 0
):
logger.warning("Invalid message count in {}.".format(messages))
broken_data = True
if dataset_attr.kto_tag and isinstance(examples[dataset_attr.kto_tag][i], bool): # kto example
prompt = aligned_messages[:-1]
response = aligned_messages[-1:]
if examples[dataset_attr.kto_tag][i]:
response = response + [{"role": Role.ASSISTANT.value, "content": ""}]
else:
response = [{"role": Role.ASSISTANT.value, "content": ""}] + response
elif (
dataset_attr.ranking
and isinstance(examples[dataset_attr.chosen][i], dict)
and isinstance(examples[dataset_attr.rejected][i], dict)
): # pairwise example
chosen = examples[dataset_attr.chosen][i]
rejected = examples[dataset_attr.rejected][i]
if (
chosen[dataset_attr.role_tag] not in accept_tags[-1]
or rejected[dataset_attr.role_tag] not in accept_tags[-1]
):
logger.warning("Invalid role tag in {}.".format([chosen, rejected]))
broken_data = True
prompt = aligned_messages
response = [
{"role": tag_mapping[chosen[dataset_attr.role_tag]], "content": chosen[dataset_attr.content_tag]},
{"role": tag_mapping[rejected[dataset_attr.role_tag]], "content": rejected[dataset_attr.content_tag]},
]
else: # normal example
prompt = aligned_messages[:-1]
response = aligned_messages[-1:]
if broken_data:
logger.warning("Skipping this abnormal example.")
continue
outputs["prompt"].append(prompt)
outputs["response"].append(response)
outputs["system"].append(system)
outputs["tools"].append(examples[dataset_attr.tools][i] if dataset_attr.tools else "")
outputs["images"].append(convert_images(examples[dataset_attr.images][i]) if dataset_attr.images else [])
return outputs
def align_dataset(
dataset: Union["Dataset", "IterableDataset"],
dataset_attr: "DatasetAttr",
data_args: "DataArguments",
training_args: "Seq2SeqTrainingArguments",
) -> Union["Dataset", "IterableDataset"]:
r"""
Aligned dataset:
prompt: [{"role": "user", "content": "..."}] * (2T - 1)
response: [{"role": "assistant", "content": "..."}] * N (N > 1 for ranking dataset)
system: "..."
tools: "...",
images: [],
"""
if dataset_attr.formatting == "alpaca":
convert_func = partial(convert_alpaca, dataset_attr=dataset_attr, data_args=data_args)
else:
convert_func = partial(convert_sharegpt, dataset_attr=dataset_attr, data_args=data_args)
column_names = list(next(iter(dataset)).keys())
features = Features.from_dict(
{
"prompt": [
{"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}}
],
"response": [
{"role": {"dtype": "string", "_type": "Value"}, "content": {"dtype": "string", "_type": "Value"}}
],
"system": {"dtype": "string", "_type": "Value"},
"tools": {"dtype": "string", "_type": "Value"},
"images": [{"_type": "Image"}],
}
)
kwargs = {}
if not data_args.streaming:
kwargs = dict(
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0),
desc="Converting format of dataset",
)
return dataset.map(
convert_func,
batched=True,
remove_columns=column_names,
features=features,
**kwargs,
)
# Copyright 2024 OpenAccess AI Collective and the LlamaFactory team.
#
# This code is inspired by the OpenAccess AI Collective's axolotl library.
# https://github.com/OpenAccess-AI-Collective/axolotl/blob/main/src/axolotl/monkeypatch/utils.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import Any, Dict, Literal, Sequence
import torch
from transformers import DataCollatorForSeq2Seq
def prepare_4d_attention_mask(attention_mask_with_indices: "torch.Tensor", dtype: "torch.dtype") -> "torch.Tensor":
r"""
Expands the attention mask with indices from (batch_size, seq_len) to (batch_size, 1, seq_len, seq_len),
while handling packed sequences and transforming the mask into lower triangular form to prevent future peeking.
e.g.
```python
# input
[[1, 1, 2, 2, 2, 0]]
# output
[
[
[
[o, x, x, x, x, x],
[o, o, x, x, x, x],
[x, x, o, x, x, x],
[x, x, o, o, x, x],
[x, x, o, o, o, x],
[x, x, x, x, x, x],
]
]
]
```
where `o` equals to `0.0`, `x` equals to `min_dtype`.
"""
bsz, seq_len = attention_mask_with_indices.size()
min_dtype = torch.finfo(dtype).min
expanded_mask = attention_mask_with_indices[:, None, None, :].expand(bsz, 1, seq_len, seq_len)
# Create a binary mask from the original mask where zeros remain zeros and all other values are set to one
padding_mask = torch.where(expanded_mask != 0, 1, 0)
# Create a block-diagonal mask.
attention_mask_4d = torch.eq(expanded_mask, expanded_mask.transpose(-1, -2)).int() * padding_mask
# Use the lower triangular mask to zero out the upper triangular part
attention_mask_4d *= torch.tril(torch.ones((seq_len, seq_len), dtype=torch.long))
# Invert the attention mask.
attention_mask_4d = torch.where(attention_mask_4d != 0, torch.tensor(0, dtype=dtype), min_dtype)
return attention_mask_4d
@dataclass
class CustomDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
r"""
Data collator for custom models (like Qwen2-VL).
"""
def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, "torch.Tensor"]:
image_grid_thw = None # TODO: better handle various VLMs
if "image_grid_thw" in features[0]:
image_grid_thw_list = [
torch.Tensor(feature["image_grid_thw"]).long()
for feature in features
if feature["image_grid_thw"][0][0] > 0
]
pixel_values_list = [
torch.Tensor(feature["pixel_values"]) for feature in features if feature["image_grid_thw"][0][0] > 0
]
if image_grid_thw_list:
image_grid_thw = torch.cat(image_grid_thw_list, dim=0)
pixel_values = torch.cat(pixel_values_list, dim=0)
else:
image_grid_thw = None
pixel_values = None
features = [
{key: feature[key] for key in feature if key not in ["image_grid_thw", "pixel_values"]}
for feature in features
]
features = super().__call__(features)
if image_grid_thw is not None:
features["image_grid_thw"] = image_grid_thw
features["pixel_values"] = pixel_values
return features
@dataclass
class SFTDataCollatorWith4DAttentionMask(CustomDataCollatorForSeq2Seq):
r"""
Data collator for 4d attention mask.
"""
block_diag_attn: bool = False
attn_implementation: Literal["eager", "sdpa", "flash_attention_2"] = "eager"
compute_dtype: "torch.dtype" = torch.float32
def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, "torch.Tensor"]:
features = super().__call__(features)
if self.block_diag_attn and self.attn_implementation != "flash_attention_2":
features["attention_mask"] = prepare_4d_attention_mask(features["attention_mask"], self.compute_dtype)
return features
@dataclass
class PairwiseDataCollatorWithPadding(CustomDataCollatorForSeq2Seq):
r"""
Data collator for pairwise data.
"""
def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, "torch.Tensor"]:
r"""
Pads batched data to the longest sequence in the batch.
We generate 2 * n examples where the first n examples represent chosen examples and
the last n examples represent rejected examples.
"""
concatenated_features = []
for key in ("chosen", "rejected"):
for feature in features:
target_feature = {
"input_ids": feature["{}_input_ids".format(key)],
"attention_mask": feature["{}_attention_mask".format(key)],
"labels": feature["{}_labels".format(key)],
}
if "pixel_values" in feature: # image data are same for chosen and rejected
target_feature["pixel_values"] = feature["pixel_values"]
if "image_grid_thw" in feature:
target_feature["image_grid_thw"] = feature["image_grid_thw"]
if "{}_token_type_ids".format(key) in feature:
target_feature["token_type_ids"] = feature["{}_token_type_ids".format(key)]
concatenated_features.append(target_feature)
return super().__call__(concatenated_features)
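# Hedged sketch of the resulting layout: a batch of n pairwise features is flattened into
# 2 * n rows, rows [0, n) taken from the chosen_* keys and rows [n, 2n) from the rejected_*
# keys, before the parent collator pads everything to a common length.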
@dataclass
class KTODataCollatorWithPadding(CustomDataCollatorForSeq2Seq):
r"""
Data collator for KTO data.
"""
def __call__(self, features: Sequence[Dict[str, Any]]) -> Dict[str, "torch.Tensor"]:
target_features = []
kl_features = []
kto_tags = []
for feature in features:
target_feature = {
"input_ids": feature["input_ids"],
"attention_mask": feature["attention_mask"],
"labels": feature["labels"],
}
kl_feature = {
"input_ids": feature["kl_input_ids"],
"attention_mask": feature["kl_attention_mask"],
"labels": feature["kl_labels"],
}
if "pixel_values" in feature:
target_feature["pixel_values"] = feature["pixel_values"]
if "image_grid_thw" in feature:
target_feature["image_grid_thw"] = feature["image_grid_thw"]
if "token_type_ids" in feature:
target_feature["token_type_ids"] = feature["token_type_ids"]
kl_feature["token_type_ids"] = feature["kl_token_type_ids"]
target_features.append(target_feature)
kl_features.append(kl_feature)
kto_tags.append(feature["kto_tags"])
batch = super().__call__(target_features)
kl_batch = super().__call__(kl_features)
batch["kl_input_ids"] = kl_batch["input_ids"]
batch["kl_attention_mask"] = kl_batch["attention_mask"]
batch["kl_labels"] = kl_batch["labels"]
if "token_type_ids" in batch:
batch["kl_token_type_ids"] = kl_batch["token_type_ids"]
batch["kto_tags"] = torch.tensor(kto_tags)
return batch
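# Hedged sketch of the returned batch: it carries the target tensors ("input_ids",
# "attention_mask", "labels"), their mismatched KL counterparts ("kl_input_ids",
# "kl_attention_mask", "kl_labels"), and a boolean "kto_tags" tensor marking which
# examples are desirable.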
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from enum import Enum, unique
from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Set, TypedDict, Union
from datasets import DatasetDict, concatenate_datasets, interleave_datasets
from ..extras.logging import get_logger
if TYPE_CHECKING:
from datasets import Dataset, IterableDataset
from ..hparams import DataArguments
logger = get_logger(__name__)
SLOTS = Sequence[Union[str, Set[str], Dict[str, str]]]
@unique
class Role(str, Enum):
USER = "user"
ASSISTANT = "assistant"
SYSTEM = "system"
FUNCTION = "function"
OBSERVATION = "observation"
class DatasetModule(TypedDict):
train_dataset: Optional[Union["Dataset", "IterableDataset"]]
eval_dataset: Optional[Union["Dataset", "IterableDataset"]]
def merge_dataset(
all_datasets: List[Union["Dataset", "IterableDataset"]], data_args: "DataArguments", seed: int
) -> Union["Dataset", "IterableDataset"]:
if len(all_datasets) == 1:
return all_datasets[0]
elif data_args.mix_strategy == "concat":
if data_args.streaming:
logger.warning("The samples between different datasets will not be mixed in streaming mode.")
return concatenate_datasets(all_datasets)
elif data_args.mix_strategy.startswith("interleave"):
if not data_args.streaming:
logger.warning("We recommend using `mix_strategy=concat` in non-streaming mode.")
return interleave_datasets(
datasets=all_datasets,
probabilities=data_args.interleave_probs,
seed=seed,
stopping_strategy="first_exhausted" if data_args.mix_strategy.endswith("under") else "all_exhausted",
)
else:
raise ValueError("Unknown mixing strategy.")
def split_dataset(
dataset: Union["Dataset", "IterableDataset"], data_args: "DataArguments", seed: int
) -> "DatasetDict":
r"""
Splits the dataset and returns a dataset dict containing train set (required) and validation set (optional).
"""
if data_args.streaming:
dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=seed)
val_set = dataset.take(int(data_args.val_size))
train_set = dataset.skip(int(data_args.val_size))
return DatasetDict({"train": train_set, "validation": val_set})
else:
val_size = int(data_args.val_size) if data_args.val_size > 1 else data_args.val_size
dataset = dataset.train_test_split(test_size=val_size, seed=seed)
return DatasetDict({"train": dataset["train"], "validation": dataset["test"]})
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from typing import List, Literal, Optional, Tuple, Union
from .data_utils import SLOTS
from .tool_utils import DefaultToolUtils, GLM4ToolUtils
@dataclass
class Formatter(ABC):
slots: SLOTS = field(default_factory=list)
tool_format: Optional[Literal["default", "glm4"]] = None
@abstractmethod
def apply(self, **kwargs) -> SLOTS: ...
def extract(self, content: str) -> Union[str, List[Tuple[str, str]]]:
raise NotImplementedError
@dataclass
class EmptyFormatter(Formatter):
def __post_init__(self):
has_placeholder = False
for slot in filter(lambda s: isinstance(s, str), self.slots):
if re.search(r"\{\{[a-zA-Z_][a-zA-Z0-9_]*\}\}", slot):
has_placeholder = True
if has_placeholder:
raise ValueError("Empty formatter should not contain any placeholder.")
def apply(self, **kwargs) -> SLOTS:
return self.slots
@dataclass
class StringFormatter(Formatter):
def __post_init__(self):
has_placeholder = False
for slot in filter(lambda s: isinstance(s, str), self.slots):
if re.search(r"\{\{[a-zA-Z_][a-zA-Z0-9_]*\}\}", slot):
has_placeholder = True
if not has_placeholder:
raise ValueError("A placeholder is required in the string formatter.")
def apply(self, **kwargs) -> SLOTS:
elements = []
for slot in self.slots:
if isinstance(slot, str):
for name, value in kwargs.items():
if not isinstance(value, str):
raise RuntimeError("Expected a string, got {}".format(value))
slot = slot.replace("{{" + name + "}}", value, 1)
elements.append(slot)
elif isinstance(slot, (dict, set)):
elements.append(slot)
else:
raise RuntimeError("Input must be string, set[str] or dict[str, str], got {}".format(type(slot)))
return elements
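# Hedged usage sketch: each {{placeholder}} in a string slot is replaced once per keyword
# argument passed to apply().
# >>> formatter = StringFormatter(slots=["<|user|>\n{{content}}"])
# >>> formatter.apply(content="Hello")
# ['<|user|>\nHello']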
@dataclass
class FunctionFormatter(Formatter):
def __post_init__(self):
if self.tool_format == "default":
self.slots = DefaultToolUtils.get_function_slots() + self.slots
elif self.tool_format == "glm4":
self.slots = GLM4ToolUtils.get_function_slots() + self.slots
else:
raise NotImplementedError("Tool format {} was not found.".format(self.tool_format))
def apply(self, **kwargs) -> SLOTS:
content = kwargs.pop("content")
functions: List[Tuple[str, str]] = []
try:
tool_calls = json.loads(content)
if not isinstance(tool_calls, list): # wrap a single call into a list (parallel calls already arrive as a list)
tool_calls = [tool_calls]
for tool_call in tool_calls:
functions.append((tool_call["name"], json.dumps(tool_call["arguments"], ensure_ascii=False)))
except json.JSONDecodeError:
functions = []
elements = []
for name, arguments in functions:
for slot in self.slots:
if isinstance(slot, str):
slot = slot.replace("{{name}}", name).replace("{{arguments}}", arguments)
elements.append(slot)
elif isinstance(slot, (dict, set)):
elements.append(slot)
else:
raise RuntimeError("Input must be string, set[str] or dict[str, str], got {}".format(type(slot)))
return elements
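# Hedged usage sketch: the content is a JSON-encoded tool call (or a list of calls for
# parallel calling); each call is rendered through the tool-format-specific function slots
# with {{name}} and {{arguments}} filled in, followed by the remaining slots.
# >>> formatter = FunctionFormatter(slots=["<eos>"], tool_format="default")
# >>> formatter.apply(content='{"name": "get_weather", "arguments": {"city": "Beijing"}}')
# # -> the default function slots rendered for get_weather, then "<eos>"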
@dataclass
class ToolFormatter(Formatter):
def __post_init__(self):
if self.tool_format == "default":
self._tool_formatter = DefaultToolUtils.tool_formatter
self._tool_extractor = DefaultToolUtils.tool_extractor
elif self.tool_format == "glm4":
self._tool_formatter = GLM4ToolUtils.tool_formatter
self._tool_extractor = GLM4ToolUtils.tool_extractor
else:
raise NotImplementedError("Tool format {} was not found.".format(self.tool_format))
def apply(self, **kwargs) -> SLOTS:
content = kwargs.pop("content")
try:
tools = json.loads(content)
return [self._tool_formatter(tools) if len(tools) != 0 else ""]
except json.JSONDecodeError:
return [""]
def extract(self, content: str) -> Union[str, List[Tuple[str, str]]]:
return self._tool_extractor(content)
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
from typing import TYPE_CHECKING, Dict, Literal, Optional, Sequence, Union
import numpy as np
from datasets import DatasetDict, load_dataset, load_from_disk
from transformers.utils.versions import require_version
from ..extras.constants import FILEEXT2TYPE
from ..extras.logging import get_logger
from ..extras.misc import has_tokenized_data
from .aligner import align_dataset
from .data_utils import merge_dataset, split_dataset
from .parser import get_dataset_list
from .preprocess import get_preprocess_and_print_func
from .template import get_template_and_fix_tokenizer
if TYPE_CHECKING:
from datasets import Dataset, IterableDataset
from transformers import PreTrainedTokenizer, ProcessorMixin, Seq2SeqTrainingArguments
from ..hparams import DataArguments, ModelArguments
from .data_utils import DatasetModule
from .parser import DatasetAttr
from .template import Template
logger = get_logger(__name__)
def _load_single_dataset(
dataset_attr: "DatasetAttr",
model_args: "ModelArguments",
data_args: "DataArguments",
training_args: "Seq2SeqTrainingArguments",
) -> Union["Dataset", "IterableDataset"]:
logger.info("Loading dataset {}...".format(dataset_attr))
data_path, data_name, data_dir, data_files = None, None, None, None
if dataset_attr.load_from in ["hf_hub", "ms_hub"]:
data_path = dataset_attr.dataset_name
data_name = dataset_attr.subset
data_dir = dataset_attr.folder
elif dataset_attr.load_from == "script":
data_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
data_name = dataset_attr.subset
data_dir = dataset_attr.folder
elif dataset_attr.load_from == "file":
data_files = []
local_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name)
if os.path.isdir(local_path): # is directory
for file_name in os.listdir(local_path):
data_files.append(os.path.join(local_path, file_name))
if data_path is None:
data_path = FILEEXT2TYPE.get(file_name.split(".")[-1], None)
elif data_path != FILEEXT2TYPE.get(file_name.split(".")[-1], None):
raise ValueError("File types should be identical.")
elif os.path.isfile(local_path): # is file
data_files.append(local_path)
data_path = FILEEXT2TYPE.get(local_path.split(".")[-1], None)
else:
raise ValueError("File {} not found.".format(local_path))
if data_path is None:
raise ValueError("Allowed file types: {}.".format(",".join(FILEEXT2TYPE.keys())))
else:
raise NotImplementedError("Unknown load type: {}.".format(dataset_attr.load_from))
if dataset_attr.load_from == "ms_hub":
require_version("modelscope>=1.11.0", "To fix: pip install modelscope>=1.11.0")
from modelscope import MsDataset
from modelscope.utils.config_ds import MS_DATASETS_CACHE
cache_dir = model_args.cache_dir or MS_DATASETS_CACHE
dataset = MsDataset.load(
dataset_name=data_path,
subset_name=data_name,
data_dir=data_dir,
data_files=data_files,
split=dataset_attr.split,
cache_dir=cache_dir,
token=model_args.ms_hub_token,
use_streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
)
if isinstance(dataset, MsDataset):
dataset = dataset.to_hf_dataset()
else:
dataset = load_dataset(
path=data_path,
name=data_name,
data_dir=data_dir,
data_files=data_files,
split=dataset_attr.split,
cache_dir=model_args.cache_dir,
token=model_args.hf_hub_token,
streaming=(data_args.streaming and (dataset_attr.load_from != "file")),
trust_remote_code=True,
)
if data_args.streaming and (dataset_attr.load_from == "file"): # faster than specifying streaming=True
dataset = dataset.to_iterable_dataset() # TODO: add num shards parameter
if dataset_attr.num_samples is not None and not data_args.streaming:
target_num = dataset_attr.num_samples
indexes = np.random.permutation(len(dataset))[:target_num]
target_num -= len(indexes)
if target_num > 0:
expand_indexes = np.random.choice(len(dataset), target_num)
indexes = np.concatenate((indexes, expand_indexes), axis=0)
assert len(indexes) == dataset_attr.num_samples, "Sample num mismatched."
dataset = dataset.select(indexes)
logger.info("Sampled {} examples from dataset {}.".format(dataset_attr.num_samples, dataset_attr))
if data_args.max_samples is not None: # truncate dataset
max_samples = min(data_args.max_samples, len(dataset))
dataset = dataset.select(range(max_samples))
return align_dataset(dataset, dataset_attr, data_args, training_args)
def _get_merged_dataset(
dataset_names: Optional[Sequence[str]],
model_args: "ModelArguments",
data_args: "DataArguments",
training_args: "Seq2SeqTrainingArguments",
stage: Literal["pt", "sft", "rm", "ppo", "kto"],
) -> Optional[Union["Dataset", "IterableDataset"]]:
if dataset_names is None:
return None
datasets = []
for dataset_attr in get_dataset_list(dataset_names, data_args.dataset_dir):
if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True):
raise ValueError("The dataset is not applicable in the current training stage.")
datasets.append(_load_single_dataset(dataset_attr, model_args, data_args, training_args))
return merge_dataset(datasets, data_args, seed=training_args.seed)
def _get_preprocessed_dataset(
dataset: Optional[Union["Dataset", "IterableDataset"]],
data_args: "DataArguments",
training_args: "Seq2SeqTrainingArguments",
stage: Literal["pt", "sft", "rm", "ppo", "kto"],
template: "Template",
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"] = None,
is_eval: bool = False,
) -> Optional[Union["Dataset", "IterableDataset"]]:
if dataset is None:
return None
preprocess_func, print_function = get_preprocess_and_print_func(
data_args, stage, template, tokenizer, processor, do_generate=(training_args.predict_with_generate and is_eval)
)
column_names = list(next(iter(dataset)).keys())
kwargs = {}
if not data_args.streaming:
kwargs = dict(
num_proc=data_args.preprocessing_num_workers,
load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0),
desc="Running tokenizer on dataset",
)
dataset = dataset.map(preprocess_func, batched=True, remove_columns=column_names, **kwargs)
if training_args.should_log:
try:
print("eval example:" if is_eval else "training example:")
print_function(next(iter(dataset)))
except StopIteration:
if stage == "pt":
raise RuntimeError("Cannot find sufficient samples, consider increasing dataset size.")
else:
raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.")
return dataset
def get_dataset(
model_args: "ModelArguments",
data_args: "DataArguments",
training_args: "Seq2SeqTrainingArguments",
stage: Literal["pt", "sft", "rm", "ppo", "kto"],
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"] = None,
) -> "DatasetModule":
template = get_template_and_fix_tokenizer(tokenizer, data_args.template, data_args.tool_format)
if data_args.train_on_prompt and template.efficient_eos:
raise ValueError("Current template does not support `train_on_prompt`.")
# Load tokenized dataset
if data_args.tokenized_path is not None:
if has_tokenized_data(data_args.tokenized_path):
logger.warning("Loading dataset from disk will ignore other data arguments.")
dataset_dict: "DatasetDict" = load_from_disk(data_args.tokenized_path)
logger.info("Loaded tokenized dataset from {}.".format(data_args.tokenized_path))
dataset_module: Dict[str, "Dataset"] = {}
if "train" in dataset_dict:
dataset_module["train_dataset"] = dataset_dict["train"]
if "validation" in dataset_dict:
dataset_module["eval_dataset"] = dataset_dict["validation"]
if data_args.streaming:
dataset_module = {k: v.to_iterable_dataset() for k, v in dataset_module.items()}
return dataset_module
if data_args.streaming:
raise ValueError("Turn off `streaming` when saving dataset to disk.")
# Load and preprocess dataset
with training_args.main_process_first(desc="load dataset"):
dataset = _get_merged_dataset(data_args.dataset, model_args, data_args, training_args, stage)
eval_dataset = _get_merged_dataset(data_args.eval_dataset, model_args, data_args, training_args, stage)
with training_args.main_process_first(desc="pre-process dataset"):
dataset = _get_preprocessed_dataset(
dataset, data_args, training_args, stage, template, tokenizer, processor, is_eval=False
)
eval_dataset = _get_preprocessed_dataset(
eval_dataset, data_args, training_args, stage, template, tokenizer, processor, is_eval=True
)
if data_args.val_size > 1e-6:
dataset_dict = split_dataset(dataset, data_args, seed=training_args.seed)
else:
dataset_dict = {}
if dataset is not None:
if data_args.streaming:
dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
dataset_dict["train"] = dataset
if eval_dataset is not None:
if data_args.streaming:
eval_dataset = eval_dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed)
dataset_dict["validation"] = eval_dataset
dataset_dict = DatasetDict(dataset_dict)
if data_args.tokenized_path is not None:
if training_args.should_save:
dataset_dict.save_to_disk(data_args.tokenized_path)
logger.info("Tokenized dataset saved at {}.".format(data_args.tokenized_path))
logger.info("Please restart the training with `tokenized_path: {}`.".format(data_args.tokenized_path))
sys.exit(0)
dataset_module = {}
if "train" in dataset_dict:
dataset_module["train_dataset"] = dataset_dict["train"]
if "validation" in dataset_dict:
dataset_module["eval_dataset"] = dataset_dict["validation"]
return dataset_module
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
from PIL.Image import Image
from transformers import ProcessorMixin
from ..extras.constants import IGNORE_INDEX, IMAGE_PLACEHOLDER
from ..extras.packages import is_pillow_available
if is_pillow_available():
import torch
from PIL import Image
if TYPE_CHECKING:
from PIL.Image import Image as ImageObject
from transformers import PreTrainedTokenizer, ProcessorMixin
from transformers.image_processing_utils import BaseImageProcessor
def get_pixel_values(images: Sequence["ImageObject"], processor: "ProcessorMixin") -> "torch.Tensor":
r"""
Processes visual inputs. (currently only supports a single image)
Returns:
pixel_values: tensor with shape (B, C, H, W)
"""
image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
image = images[0] if len(images) != 0 else Image.new("RGB", (100, 100), (255, 255, 255))
return image_processor([image], return_tensors="pt")["pixel_values"]
def get_paligemma_token_type_ids(input_len: int, processor: "ProcessorMixin") -> List[List[int]]:
r"""
Gets paligemma token type ids for computing loss.
Returns:
token_type_ids: shape (1, seq_len)
"""
image_seq_length = getattr(processor, "image_seq_length")
return [[0] * image_seq_length + [1] * (input_len - image_seq_length)]
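# Hedged sketch, assuming the processor exposes image_seq_length == 4:
# >>> get_paligemma_token_type_ids(6, processor)
# [[0, 0, 0, 0, 1, 1]]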
def get_qwen2vl_image_inputs(
images: Sequence["ImageObject"], processor: "ProcessorMixin"
) -> Dict[str, "torch.Tensor"]:
r"""
Processes qwen2-vl visual inputs. Supports multiple images.
Returns:
pixel_values: tensor with shape (num_patches, patch_dim)
image_grid_thw: tensor with shape (num_images, 3), where the three numbers are time, height, width
It holds that num_patches == image_grid_thw.prod(dim=-1).sum()
"""
image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
if len(images) != 0:
image_inputs = image_processor(images=images, return_tensors="pt")
else:
image = Image.new("RGB", (56, 56), (255, 255, 255))
image_inputs = image_processor(images=[image], return_tensors="pt")
image_inputs["image_grid_thw"][0][0] = 0 # fake image
return {"pixel_values": image_inputs["pixel_values"], "image_grid_thw": image_inputs["image_grid_thw"]}
class BasePlugin:
def __init__(self, image_token: str) -> None:
self.image_token = image_token
def process_messages(
self,
messages: Sequence[Dict[str, str]],
images: Sequence["ImageObject"],
processor: Optional["ProcessorMixin"],
) -> List[Dict[str, str]]:
r"""
Pre-processes input messages before tokenization for VLMs.
"""
return messages
def process_token_ids(
self,
input_ids: List[int],
labels: Optional[List[int]],
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
) -> Tuple[List[int], Optional[List[int]]]:
r"""
Pre-processes token ids after tokenization for VLMs.
"""
return input_ids, labels
def get_mm_inputs(
self,
images: Sequence["ImageObject"],
feature_seqlens: Dict[str, int],
processor: Optional["ProcessorMixin"],
) -> Dict[str, Any]:
r"""
Builds batched multimodal inputs for VLMs.
"""
return {}
def process_model_inputs(
self,
model_inputs: Dict[str, List[Any]],
images: Sequence["ImageObject"],
feature_seqlens: Dict[str, int],
processor: Optional["ProcessorMixin"],
) -> None:
r"""
Appends multimodal inputs to model inputs for VLMs.
"""
return
class LlavaPlugin(BasePlugin):
def process_messages(
self,
messages: Sequence[Dict[str, str]],
images: Sequence["ImageObject"],
processor: Optional["ProcessorMixin"],
) -> List[Dict[str, str]]:
image_count = 0
new_messages = []
for message in messages:
content = message["content"]
while IMAGE_PLACEHOLDER in content:
image_count += 1
if image_count > 1:
raise ValueError("Llava model only accepts one image per sample.")
content = content.replace(IMAGE_PLACEHOLDER, self.image_token, 1)
new_messages.append({"role": message["role"], "content": content})
return new_messages
def get_mm_inputs(
self,
images: Sequence["ImageObject"],
feature_seqlens: Dict[str, int],
processor: Optional["ProcessorMixin"],
) -> Dict[str, Any]:
return {"pixel_values": get_pixel_values(images, processor)}
def process_model_inputs(
self,
model_inputs: Dict[str, List[Any]],
images: Sequence["ImageObject"],
feature_seqlens: Dict[str, int],
processor: Optional["ProcessorMixin"],
) -> None:
mm_inputs = self.get_mm_inputs(images, feature_seqlens, processor)
model_inputs["pixel_values"].append(mm_inputs["pixel_values"][0])
class PaliGemmaPlugin(BasePlugin):
def process_messages(
self,
messages: Sequence[Dict[str, str]],
images: Sequence["ImageObject"],
processor: Optional["ProcessorMixin"],
) -> List[Dict[str, str]]:
image_count = 0
new_messages = []
for message in messages:
content = message["content"]
while IMAGE_PLACEHOLDER in content:
image_count += 1
if image_count > 1:
raise ValueError("PaliGemma model only accepts one image per sample.")
content = content.replace(IMAGE_PLACEHOLDER, "", 1)
new_messages.append({"role": message["role"], "content": content})
return new_messages
def process_token_ids(
self,
input_ids: List[int],
labels: Optional[List[int]],
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
) -> Tuple[List[int], Optional[List[int]]]:
image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
image_seq_length: int = getattr(image_processor, "image_seq_length")
image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
input_ids = [image_token_id] * image_seq_length + input_ids
if labels is not None:
labels = [IGNORE_INDEX] * image_seq_length + labels
return input_ids, labels
def get_mm_inputs(
self,
images: Sequence["ImageObject"],
feature_seqlens: Dict[str, int],
processor: Optional["ProcessorMixin"],
) -> Dict[str, Any]:
mm_inputs = {"pixel_values": get_pixel_values(images, processor)}
for feature_name, feature_length in feature_seqlens.items():
mm_inputs[feature_name] = get_paligemma_token_type_ids(feature_length, processor)
return mm_inputs
def process_model_inputs(
self,
model_inputs: Dict[str, List[Any]],
images: Sequence["ImageObject"],
feature_seqlens: Dict[str, int],
processor: Optional["ProcessorMixin"],
) -> None:
mm_inputs = self.get_mm_inputs(images, feature_seqlens, processor)
model_inputs["pixel_values"].append(mm_inputs["pixel_values"][0])
for feature_name in feature_seqlens.keys():
model_inputs[feature_name].append(mm_inputs[feature_name][0])
class Qwen2vlPlugin(BasePlugin):
def process_messages(
self,
messages: Sequence[Dict[str, str]],
images: Sequence["ImageObject"],
processor: Optional["ProcessorMixin"],
) -> List[Dict[str, str]]:
image_processor: "BaseImageProcessor" = getattr(processor, "image_processor")
merge_length: int = getattr(image_processor, "merge_size") ** 2
if len(images) > 0:
image_grid_thw = get_qwen2vl_image_inputs(images, processor)["image_grid_thw"]
index = 0
new_messages = []
for message in messages:
content = message["content"]
while IMAGE_PLACEHOLDER in content:
content = content.replace(
IMAGE_PLACEHOLDER,
"<|vision_start|>{}<|vision_end|>".format(
self.image_token * (image_grid_thw[index].prod() // merge_length)
),
1,
)
index += 1
new_messages.append({"role": message["role"], "content": content})
return new_messages
def get_mm_inputs(
self,
images: Sequence["ImageObject"],
feature_seqlens: Dict[str, int],
processor: Optional["ProcessorMixin"],
) -> Dict[str, Any]:
return get_qwen2vl_image_inputs(images, processor)
def process_model_inputs(
self,
model_inputs: Dict[str, List[Any]],
images: Sequence["ImageObject"],
feature_seqlens: Dict[str, int],
processor: Optional["ProcessorMixin"],
) -> None:
mm_inputs = self.get_mm_inputs(images, feature_seqlens, processor)
model_inputs["pixel_values"].append(mm_inputs["pixel_values"])
model_inputs["image_grid_thw"].append(mm_inputs["image_grid_thw"])
PLUGINS = {
"llava": LlavaPlugin,
"paligemma": PaliGemmaPlugin,
"qwen2_vl": Qwen2vlPlugin,
}
def get_mm_plugin(name: str, image_token: str) -> "BasePlugin":
if name not in PLUGINS:
raise ValueError("{} not found.".format(name))
return PLUGINS[name](image_token)
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Literal, Optional, Sequence
from transformers.utils import cached_file
from ..extras.constants import DATA_CONFIG
from ..extras.misc import use_modelscope
@dataclass
class DatasetAttr:
r"""
Dataset attributes.
"""
# basic configs
load_from: Literal["hf_hub", "ms_hub", "script", "file"]
dataset_name: str
formatting: Literal["alpaca", "sharegpt"] = "alpaca"
ranking: bool = False
# extra configs
subset: Optional[str] = None
split: str = "train"
folder: Optional[str] = None
num_samples: Optional[int] = None
# common columns
system: Optional[str] = None
tools: Optional[str] = None
images: Optional[str] = None
# rlhf columns
chosen: Optional[str] = None
rejected: Optional[str] = None
kto_tag: Optional[str] = None
# alpaca columns
prompt: Optional[str] = "instruction"
query: Optional[str] = "input"
response: Optional[str] = "output"
history: Optional[str] = None
# sharegpt columns
messages: Optional[str] = "conversations"
# sharegpt tags
role_tag: Optional[str] = "from"
content_tag: Optional[str] = "value"
user_tag: Optional[str] = "human"
assistant_tag: Optional[str] = "gpt"
observation_tag: Optional[str] = "observation"
function_tag: Optional[str] = "function_call"
system_tag: Optional[str] = "system"
def __repr__(self) -> str:
return self.dataset_name
def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None:
setattr(self, key, obj.get(key, default))
def get_dataset_list(dataset_names: Optional[Sequence[str]], dataset_dir: str) -> List["DatasetAttr"]:
r"""
Gets the attributes of the datasets.
"""
if dataset_names is None:
dataset_names = []
if dataset_dir == "ONLINE":
dataset_info = None
else:
if dataset_dir.startswith("REMOTE:"):
config_path = cached_file(path_or_repo_id=dataset_dir[7:], filename=DATA_CONFIG, repo_type="dataset")
else:
config_path = os.path.join(dataset_dir, DATA_CONFIG)
try:
with open(config_path, "r") as f:
dataset_info = json.load(f)
except Exception as err:
if len(dataset_names) != 0:
raise ValueError("Cannot open {} due to {}.".format(config_path, str(err)))
dataset_info = None
dataset_list: List["DatasetAttr"] = []
for name in dataset_names:
if dataset_info is None: # dataset_dir is ONLINE
load_from = "ms_hub" if use_modelscope() else "hf_hub"
dataset_attr = DatasetAttr(load_from, dataset_name=name)
dataset_list.append(dataset_attr)
continue
if name not in dataset_info:
raise ValueError("Undefined dataset {} in {}.".format(name, DATA_CONFIG))
has_hf_url = "hf_hub_url" in dataset_info[name]
has_ms_url = "ms_hub_url" in dataset_info[name]
if has_hf_url or has_ms_url:
if (use_modelscope() and has_ms_url) or (not has_hf_url):
dataset_attr = DatasetAttr("ms_hub", dataset_name=dataset_info[name]["ms_hub_url"])
else:
dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"])
elif "script_url" in dataset_info[name]:
dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"])
else:
dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"])
dataset_attr.set_attr("formatting", dataset_info[name], default="alpaca")
dataset_attr.set_attr("ranking", dataset_info[name], default=False)
dataset_attr.set_attr("subset", dataset_info[name])
dataset_attr.set_attr("split", dataset_info[name], default="train")
dataset_attr.set_attr("folder", dataset_info[name])
dataset_attr.set_attr("num_samples", dataset_info[name])
if "columns" in dataset_info[name]:
column_names = ["system", "tools", "images", "chosen", "rejected", "kto_tag"]
if dataset_attr.formatting == "alpaca":
column_names.extend(["prompt", "query", "response", "history"])
else:
column_names.extend(["messages"])
for column_name in column_names:
dataset_attr.set_attr(column_name, dataset_info[name]["columns"])
if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]:
tag_names = (
"role_tag",
"content_tag",
"user_tag",
"assistant_tag",
"observation_tag",
"function_tag",
"system_tag",
)
for tag in tag_names:
dataset_attr.set_attr(tag, dataset_info[name]["tags"])
dataset_list.append(dataset_attr)
return dataset_list
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
from typing import TYPE_CHECKING, Callable, Literal, Optional, Tuple
from .processors.feedback import preprocess_feedback_dataset
from .processors.pairwise import preprocess_pairwise_dataset, print_pairwise_dataset_example
from .processors.pretrain import preprocess_pretrain_dataset
from .processors.supervised import (
preprocess_packed_supervised_dataset,
preprocess_supervised_dataset,
print_supervised_dataset_example,
)
from .processors.unsupervised import preprocess_unsupervised_dataset, print_unsupervised_dataset_example
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, ProcessorMixin
from ..hparams import DataArguments
from .template import Template
def get_preprocess_and_print_func(
data_args: "DataArguments",
stage: Literal["pt", "sft", "rm", "ppo", "kto"],
template: "Template",
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
do_generate: bool = False,
) -> Tuple[Callable, Callable]:
if stage == "pt":
preprocess_func = partial(
preprocess_pretrain_dataset,
tokenizer=tokenizer,
data_args=data_args,
)
print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
elif stage == "sft" and not do_generate:
if data_args.packing:
if data_args.neat_packing:
from datasets.arrow_writer import OptimizedTypedSequence, TypedSequence
def __init__(self, data, **kwargs):
return TypedSequence.__init__(
self,
data,
type=kwargs.pop("type", None),
try_type=kwargs.pop("try_type", None),
optimized_int_type=kwargs.pop("optimized_int_type", None),
)
OptimizedTypedSequence.__init__ = __init__
preprocess_func = partial(
preprocess_packed_supervised_dataset,
template=template,
tokenizer=tokenizer,
data_args=data_args,
)
else:
preprocess_func = partial(
preprocess_supervised_dataset,
template=template,
tokenizer=tokenizer,
processor=processor,
data_args=data_args,
)
print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
elif stage == "rm":
preprocess_func = partial(
preprocess_pairwise_dataset,
template=template,
tokenizer=tokenizer,
processor=processor,
data_args=data_args,
)
print_function = partial(print_pairwise_dataset_example, tokenizer=tokenizer)
elif stage == "kto":
preprocess_func = partial(
preprocess_feedback_dataset,
template=template,
tokenizer=tokenizer,
processor=processor,
data_args=data_args,
)
print_function = partial(print_supervised_dataset_example, tokenizer=tokenizer)
else:
preprocess_func = partial(
preprocess_unsupervised_dataset,
template=template,
tokenizer=tokenizer,
processor=processor,
data_args=data_args,
)
print_function = partial(print_unsupervised_dataset_example, tokenizer=tokenizer)
return preprocess_func, print_function
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
from ...extras.constants import IGNORE_INDEX
from ...extras.logging import get_logger
from .processor_utils import infer_seqlen
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, ProcessorMixin
from ...hparams import DataArguments
from ..template import Template
logger = get_logger(__name__)
def _encode_feedback_example(
prompt: Sequence[Dict[str, str]],
response: Sequence[Dict[str, str]],
kl_response: Sequence[Dict[str, str]],
system: Optional[str],
tools: Optional[str],
template: "Template",
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
cutoff_len: int,
) -> Tuple[List[int], List[int], List[int], List[int], bool]:
if response[0]["content"]: # desired example
kto_tag = True
messages = prompt + [response[0]]
else: # undesired example
kto_tag = False
messages = prompt + [response[1]]
if kl_response[0]["content"]:
kl_messages = prompt + [kl_response[0]]
else:
kl_messages = prompt + [kl_response[1]]
prompt_ids, response_ids = template.encode_oneturn(tokenizer, messages, system, tools)
kl_prompt_ids, kl_response_ids = template.encode_oneturn(tokenizer, kl_messages, system, tools)
if template.efficient_eos:
response_ids += [tokenizer.eos_token_id]
kl_response_ids += [tokenizer.eos_token_id]
prompt_ids, _ = template.mm_plugin.process_token_ids(prompt_ids, None, tokenizer, processor)
kl_prompt_ids, _ = template.mm_plugin.process_token_ids(kl_prompt_ids, None, tokenizer, processor)
source_len, target_len = infer_seqlen(len(prompt_ids), len(response_ids), cutoff_len)
prompt_ids = prompt_ids[:source_len]
response_ids = response_ids[:target_len]
kl_source_len, kl_target_len = infer_seqlen(len(kl_prompt_ids), len(kl_response_ids), cutoff_len)
kl_prompt_ids = kl_prompt_ids[:kl_source_len]
kl_response_ids = kl_response_ids[:kl_target_len]
input_ids = prompt_ids + response_ids
labels = [IGNORE_INDEX] * source_len + response_ids
kl_input_ids = kl_prompt_ids + kl_response_ids
kl_labels = [IGNORE_INDEX] * kl_source_len + kl_response_ids
return input_ids, labels, kl_input_ids, kl_labels, kto_tag
def preprocess_feedback_dataset(
examples: Dict[str, List[Any]],
template: "Template",
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
data_args: "DataArguments",
) -> Dict[str, List[Any]]:
# create mismatched input-output pairs for estimating the KL term by reversing the list of responses
kl_response = examples["response"][::-1]
model_inputs = defaultdict(list)
for i in range(len(examples["prompt"])):
if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) < 2:
logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
continue
prompt = template.mm_plugin.process_messages(examples["prompt"][i], examples["images"][i], processor)
input_ids, labels, kl_input_ids, kl_labels, kto_tag = _encode_feedback_example(
prompt=prompt,
response=examples["response"][i],
kl_response=kl_response[i],
system=examples["system"][i],
tools=examples["tools"][i],
template=template,
tokenizer=tokenizer,
processor=processor,
cutoff_len=data_args.cutoff_len,
)
model_inputs["input_ids"].append(input_ids)
model_inputs["attention_mask"].append([1] * len(input_ids))
model_inputs["labels"].append(labels)
model_inputs["kl_input_ids"].append(kl_input_ids)
model_inputs["kl_attention_mask"].append([1] * len(kl_input_ids))
model_inputs["kl_labels"].append(kl_labels)
model_inputs["kto_tags"].append(kto_tag)
template.mm_plugin.process_model_inputs(
model_inputs=model_inputs,
images=examples["images"][i],
feature_seqlens={
"token_type_ids": len(input_ids),
"kl_token_type_ids": len(kl_input_ids),
},
processor=processor,
)
desirable_num = sum([1 for tag in model_inputs["kto_tags"] if tag])
undesirable_num = len(model_inputs["kto_tags"]) - desirable_num
if desirable_num == 0 or undesirable_num == 0:
logger.warning("Your dataset only has one preference type.")
return model_inputs
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
from ...extras.constants import IGNORE_INDEX
from ...extras.logging import get_logger
from .processor_utils import infer_seqlen
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, ProcessorMixin
from ...hparams import DataArguments
from ..template import Template
logger = get_logger(__name__)
def _encode_pairwise_example(
prompt: Sequence[Dict[str, str]],
response: Sequence[Dict[str, str]],
system: Optional[str],
tools: Optional[str],
template: "Template",
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
cutoff_len: int,
) -> Tuple[List[int], List[int], List[int], List[int]]:
chosen_messages = prompt + [response[0]]
rejected_messages = prompt + [response[1]]
prompt_ids, chosen_ids = template.encode_oneturn(tokenizer, chosen_messages, system, tools)
_, rejected_ids = template.encode_oneturn(tokenizer, rejected_messages, system, tools)
if template.efficient_eos:
chosen_ids += [tokenizer.eos_token_id]
rejected_ids += [tokenizer.eos_token_id]
prompt_ids, _ = template.mm_plugin.process_token_ids(prompt_ids, None, tokenizer, processor)
# consider the response to be more important than the prompt when truncating
source_len, target_len = infer_seqlen(len(prompt_ids), max(len(chosen_ids), len(rejected_ids)), cutoff_len)
prompt_ids = prompt_ids[:source_len]
chosen_ids = chosen_ids[:target_len]
rejected_ids = rejected_ids[:target_len]
chosen_input_ids = prompt_ids + chosen_ids
chosen_labels = [IGNORE_INDEX] * source_len + chosen_ids
rejected_input_ids = prompt_ids + rejected_ids
rejected_labels = [IGNORE_INDEX] * source_len + rejected_ids
return chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels
def preprocess_pairwise_dataset(
examples: Dict[str, List[Any]],
template: "Template",
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
data_args: "DataArguments",
) -> Dict[str, List[Any]]:
# build input pairs with format `<bos> X`, `Y1 <eos>` and `Y2 <eos>`
model_inputs = defaultdict(list)
for i in range(len(examples["prompt"])):
if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) < 2:
logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
continue
prompt = template.mm_plugin.process_messages(examples["prompt"][i], examples["images"][i], processor)
chosen_input_ids, chosen_labels, rejected_input_ids, rejected_labels = _encode_pairwise_example(
prompt=prompt,
response=examples["response"][i],
system=examples["system"][i],
tools=examples["tools"][i],
template=template,
tokenizer=tokenizer,
processor=processor,
cutoff_len=data_args.cutoff_len,
)
model_inputs["chosen_input_ids"].append(chosen_input_ids)
model_inputs["chosen_attention_mask"].append([1] * len(chosen_input_ids))
model_inputs["chosen_labels"].append(chosen_labels)
model_inputs["rejected_input_ids"].append(rejected_input_ids)
model_inputs["rejected_attention_mask"].append([1] * len(rejected_input_ids))
model_inputs["rejected_labels"].append(rejected_labels)
template.mm_plugin.process_model_inputs(
model_inputs=model_inputs,
images=examples["images"][i],
feature_seqlens={
"chosen_token_type_ids": len(chosen_input_ids),
"rejected_token_type_ids": len(rejected_input_ids),
},
processor=processor,
)
return model_inputs
def print_pairwise_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None:
valid_chosen_labels = list(filter(lambda x: x != IGNORE_INDEX, example["chosen_labels"]))
valid_rejected_labels = list(filter(lambda x: x != IGNORE_INDEX, example["rejected_labels"]))
print("chosen_input_ids:\n{}".format(example["chosen_input_ids"]))
print("chosen_inputs:\n{}".format(tokenizer.decode(example["chosen_input_ids"], skip_special_tokens=False)))
print("chosen_label_ids:\n{}".format(example["chosen_labels"]))
print("chosen_labels:\n{}".format(tokenizer.decode(valid_chosen_labels, skip_special_tokens=False)))
print("rejected_input_ids:\n{}".format(example["rejected_input_ids"]))
print("rejected_inputs:\n{}".format(tokenizer.decode(example["rejected_input_ids"], skip_special_tokens=False)))
print("rejected_label_ids:\n{}".format(example["rejected_labels"]))
print("rejected_labels:\n{}".format(tokenizer.decode(valid_rejected_labels, skip_special_tokens=False)))
# Copyright 2024 HuggingFace Inc. and the LlamaFactory team.
#
# This code is inspired by the HuggingFace's transformers library.
# https://github.com/huggingface/transformers/blob/v4.40.0/examples/pytorch/language-modeling/run_clm.py
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from itertools import chain
from typing import TYPE_CHECKING, Any, Dict, List
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer
from ...hparams import DataArguments
def preprocess_pretrain_dataset(
examples: Dict[str, List[Any]], tokenizer: "PreTrainedTokenizer", data_args: "DataArguments"
) -> Dict[str, List[Any]]:
# build grouped texts with format `X1 X2 X3 ...` if packing is enabled
eos_token = "<|end_of_text|>" if data_args.template == "llama3" else tokenizer.eos_token
text_examples = [messages[0]["content"] + eos_token for messages in examples["prompt"]]
if not data_args.packing:
if data_args.template == "gemma":
text_examples = [tokenizer.bos_token + example for example in text_examples]
result = tokenizer(text_examples, add_special_tokens=False, max_length=data_args.cutoff_len, truncation=True)
else:
tokenized_examples = tokenizer(text_examples, add_special_tokens=False)
concatenated_examples = {k: list(chain(*tokenized_examples[k])) for k in tokenized_examples.keys()}
total_length = len(concatenated_examples[list(concatenated_examples.keys())[0]])
block_size = data_args.cutoff_len
total_length = (total_length // block_size) * block_size
result = {
k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
for k, t in concatenated_examples.items()
}
if data_args.template == "gemma":
for i in range(len(result["input_ids"])):
result["input_ids"][i][0] = tokenizer.bos_token_id
return result
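# Hedged sketch of the packed branch: with cutoff_len = 4, token streams of lengths 3 and 6
# are concatenated into 9 tokens and chunked into two blocks of 4; the trailing remainder of
# 1 token is dropped by the (total_length // block_size) * block_size truncation.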
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import bisect
from typing import List, Sequence, Tuple
def search_for_fit(numbers: Sequence[int], capacity: int) -> int:
r"""
Finds the index of the largest number that fits into the knapsack with the given capacity.
"""
index = bisect.bisect(numbers, capacity)
return -1 if index == 0 else (index - 1)
def greedy_knapsack(numbers: List[int], capacity: int) -> List[List[int]]:
r"""
An efficient greedy algorithm with binary search for the knapsack problem.
"""
numbers.sort() # sort numbers in ascending order for binary search
knapsacks = []
while numbers:
current_knapsack = []
remaining_capacity = capacity
while True:
index = search_for_fit(numbers, remaining_capacity)
if index == -1:
break # no more numbers fit in this knapsack
remaining_capacity -= numbers[index] # update the remaining capacity
current_knapsack.append(numbers.pop(index)) # add the number to knapsack
knapsacks.append(current_knapsack)
return knapsacks
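# Hedged usage sketch:
# >>> greedy_knapsack([5, 3, 2, 7, 1], capacity=8)
# [[7, 1], [5, 3], [2]]
# Each knapsack is filled greedily with the largest remaining number that still fits.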
def infer_seqlen(source_len: int, target_len: int, cutoff_len: int) -> Tuple[int, int]:
r"""
Computes the real sequence length after truncation by the cutoff_len.
"""
if target_len * 2 < cutoff_len: # truncate source
max_target_len = cutoff_len
elif source_len * 2 < cutoff_len: # truncate target
max_target_len = cutoff_len - source_len
else: # truncate both
max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))
new_target_len = min(max_target_len, target_len)
max_source_len = max(cutoff_len - new_target_len, 0)
new_source_len = min(max_source_len, source_len)
return new_source_len, new_target_len
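# Hedged usage sketch:
# >>> infer_seqlen(source_len=100, target_len=20, cutoff_len=64)
# (44, 20)
# >>> infer_seqlen(source_len=30, target_len=50, cutoff_len=64)
# (30, 34)
# The short side is kept intact and the remaining budget goes to the longer side.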
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
from ...extras.constants import IGNORE_INDEX
from ...extras.logging import get_logger
from .processor_utils import greedy_knapsack, infer_seqlen
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, ProcessorMixin
from ...hparams import DataArguments
from ..template import Template
logger = get_logger(__name__)
def _encode_supervised_example(
prompt: Sequence[Dict[str, str]],
response: Sequence[Dict[str, str]],
system: Optional[str],
tools: Optional[str],
template: "Template",
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
cutoff_len: int,
train_on_prompt: bool,
mask_history: bool,
) -> Tuple[List[int], List[int]]:
messages = prompt + response
input_ids, labels = [], []
input_ids, labels = template.mm_plugin.process_token_ids(input_ids, labels, tokenizer, processor)
encoded_pairs = template.encode_multiturn(tokenizer, messages, system, tools)
total_length = 1 if template.efficient_eos else 0
if mask_history:
encoded_pairs = encoded_pairs[::-1] # high priority for last turns
for turn_idx, (source_ids, target_ids) in enumerate(encoded_pairs):
if total_length >= cutoff_len:
break
source_len, target_len = infer_seqlen(len(source_ids), len(target_ids), cutoff_len - total_length)
source_ids = source_ids[:source_len]
target_ids = target_ids[:target_len]
total_length += source_len + target_len
if train_on_prompt:
source_label = source_ids
elif template.efficient_eos:
source_label = [tokenizer.eos_token_id] + [IGNORE_INDEX] * (source_len - 1)
else:
source_label = [IGNORE_INDEX] * source_len
if mask_history and turn_idx != 0: # train on the last turn only
target_label = [IGNORE_INDEX] * target_len
else:
target_label = target_ids
if mask_history: # reversed sequences
input_ids = source_ids + target_ids + input_ids
labels = source_label + target_label + labels
else:
input_ids += source_ids + target_ids
labels += source_label + target_label
if template.efficient_eos:
input_ids += [tokenizer.eos_token_id]
labels += [tokenizer.eos_token_id]
return input_ids, labels
def preprocess_supervised_dataset(
examples: Dict[str, List[Any]],
template: "Template",
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
data_args: "DataArguments",
) -> Dict[str, List[Any]]:
# build inputs with format `<bos> X Y <eos>` and labels with format `<ignore> ... <ignore> Y <eos>`
# for multiturn examples, we only mask the prompt part in each prompt-response pair.
model_inputs = defaultdict(list)
for i in range(len(examples["prompt"])):
if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
continue
prompt = template.mm_plugin.process_messages(examples["prompt"][i], examples["images"][i], processor)
input_ids, labels = _encode_supervised_example(
prompt=prompt,
response=examples["response"][i],
system=examples["system"][i],
tools=examples["tools"][i],
template=template,
tokenizer=tokenizer,
processor=processor,
cutoff_len=data_args.cutoff_len,
train_on_prompt=data_args.train_on_prompt,
mask_history=data_args.mask_history,
)
model_inputs["input_ids"].append(input_ids)
model_inputs["attention_mask"].append([1] * len(input_ids))
model_inputs["labels"].append(labels)
template.mm_plugin.process_model_inputs(
model_inputs=model_inputs,
images=examples["images"][i],
feature_seqlens={"token_type_ids": len(input_ids)},
processor=processor,
)
return model_inputs
def preprocess_packed_supervised_dataset(
examples: Dict[str, List[Any]],
template: "Template",
tokenizer: "PreTrainedTokenizer",
data_args: "DataArguments",
) -> Dict[str, List[Any]]:
# build inputs with format `<bos> X1 Y1 <eos> <bos> X2 Y2 <eos>`
# and labels with format `<ignore> ... <ignore> Y1 <eos> <ignore> ... <ignore> Y2 <eos>`
valid_num = 0
batch_input_ids, batch_labels = [], []
lengths = []
length2indexes = defaultdict(list)
for i in range(len(examples["prompt"])):
if len(examples["prompt"][i]) % 2 != 1 or len(examples["response"][i]) != 1:
logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
continue
input_ids, labels = _encode_supervised_example(
prompt=examples["prompt"][i],
response=examples["response"][i],
system=examples["system"][i],
tools=examples["tools"][i],
template=template,
tokenizer=tokenizer,
processor=None,
cutoff_len=data_args.cutoff_len - 1, # reserved for the padding token
train_on_prompt=data_args.train_on_prompt,
mask_history=data_args.mask_history,
)
length = len(input_ids)
if length > data_args.cutoff_len:
logger.warning("Dropped lengthy example with length {} > {}.".format(length, data_args.cutoff_len))
else:
lengths.append(length)
length2indexes[length].append(valid_num)
batch_input_ids.append(input_ids)
batch_labels.append(labels)
valid_num += 1
model_inputs = defaultdict(list)
knapsacks = greedy_knapsack(lengths, data_args.cutoff_len - 1) # reserved for the padding token
for knapsack in knapsacks:
packed_input_ids, packed_attention_masks, packed_labels = [], [], []
for i, length in enumerate(knapsack):
index = length2indexes[length].pop()
packed_input_ids += batch_input_ids[index]
packed_labels += batch_labels[index]
if data_args.neat_packing:
packed_attention_masks += [i + 1] * len(batch_input_ids[index]) # start from 1
else:
packed_attention_masks += [1] * len(batch_input_ids[index])
if len(packed_input_ids) < data_args.cutoff_len:
pad_length = data_args.cutoff_len - len(packed_input_ids)
packed_input_ids += [tokenizer.pad_token_id] * pad_length
packed_labels += [IGNORE_INDEX] * pad_length
if data_args.neat_packing:
packed_attention_masks += [0] * pad_length
else:
packed_attention_masks += [1] * pad_length # more efficient flash_attn
if len(packed_input_ids) != data_args.cutoff_len:
raise ValueError("The length of packed example should be identical to the cutoff length.")
model_inputs["input_ids"].append(packed_input_ids)
model_inputs["attention_mask"].append(packed_attention_masks)
model_inputs["labels"].append(packed_labels)
return model_inputs
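# Hedged sketch: with neat_packing enabled and cutoff_len = 8, packing two examples of
# lengths 4 and 3 yields attention_mask [1, 1, 1, 1, 2, 2, 2, 0], which
# prepare_4d_attention_mask later expands into a block-diagonal causal mask; without
# neat_packing the same positions (padding included) would all be 1.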
def print_supervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None:
valid_labels = list(filter(lambda x: x != IGNORE_INDEX, example["labels"]))
print("input_ids:\n{}".format(example["input_ids"]))
print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
print("label_ids:\n{}".format(example["labels"]))
print("labels:\n{}".format(tokenizer.decode(valid_labels, skip_special_tokens=False)))
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Tuple
from ...extras.logging import get_logger
from ..data_utils import Role
from .processor_utils import infer_seqlen
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer, ProcessorMixin
from ...hparams import DataArguments
from ..template import Template
logger = get_logger(__name__)
def _encode_unsupervised_example(
prompt: Sequence[Dict[str, str]],
response: Sequence[Dict[str, str]],
system: Optional[str],
tools: Optional[str],
template: "Template",
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
cutoff_len: int,
) -> Tuple[List[int], List[int]]:
if len(response) == 1:
messages = prompt + response
else:
messages = prompt + [{"role": Role.ASSISTANT.value, "content": ""}]
input_ids, labels = template.encode_oneturn(tokenizer, messages, system, tools)
if template.efficient_eos:
labels += [tokenizer.eos_token_id]
input_ids, _ = template.mm_plugin.process_token_ids(input_ids, None, tokenizer, processor)
source_len, target_len = infer_seqlen(len(input_ids), len(labels), cutoff_len)
input_ids = input_ids[:source_len]
labels = labels[:target_len]
return input_ids, labels
def preprocess_unsupervised_dataset(
examples: Dict[str, List[Any]],
template: "Template",
tokenizer: "PreTrainedTokenizer",
processor: Optional["ProcessorMixin"],
data_args: "DataArguments",
) -> Dict[str, List[Any]]:
# build inputs with format `<bos> X` and labels with format `Y <eos>`
model_inputs = defaultdict(list)
for i in range(len(examples["prompt"])):
if len(examples["prompt"][i]) % 2 != 1:
logger.warning("Dropped invalid example: {}".format(examples["prompt"][i] + examples["response"][i]))
continue
prompt = template.mm_plugin.process_messages(examples["prompt"][i], examples["images"][i], processor)
input_ids, labels = _encode_unsupervised_example(
prompt=prompt,
response=examples["response"][i],
system=examples["system"][i],
tools=examples["tools"][i],
template=template,
tokenizer=tokenizer,
processor=processor,
cutoff_len=data_args.cutoff_len,
)
model_inputs["input_ids"].append(input_ids)
model_inputs["attention_mask"].append([1] * len(input_ids))
model_inputs["labels"].append(labels)
template.mm_plugin.process_model_inputs(
model_inputs=model_inputs,
images=examples["images"][i],
feature_seqlens={"token_type_ids": len(input_ids)},
processor=processor,
)
return model_inputs
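
The comment at the top of this function describes the output layout: `input_ids` hold the rendered prompt ending with the generation prompt, and `labels` hold the response tokens. A minimal sketch of that split using a plain chat template (the tiny model below is only a placeholder, mirroring the data tests later in this change):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("llamafactory/tiny-random-Llama-3")  # placeholder model
messages = [
    {"role": "user", "content": "How are you"},
    {"role": "assistant", "content": "I am fine!"},
]
full_ids = tokenizer.apply_chat_template(messages)
prompt_ids = tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
input_ids = prompt_ids                   # "<bos> X": everything up to the assistant turn
labels = full_ids[len(prompt_ids):]      # "Y <eos>": the assistant response tokens
```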
def print_unsupervised_dataset_example(example: Dict[str, List[int]], tokenizer: "PreTrainedTokenizer") -> None:
print("input_ids:\n{}".format(example["input_ids"]))
print("inputs:\n{}".format(tokenizer.decode(example["input_ids"], skip_special_tokens=False)))
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, List, Optional, Sequence, Tuple, Union
from ..extras.constants import IMAGE_PLACEHOLDER
from ..extras.logging import get_logger
from .data_utils import Role
from .formatter import EmptyFormatter, FunctionFormatter, StringFormatter, ToolFormatter
from .mm_plugin import BasePlugin, get_mm_plugin
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer
from .formatter import SLOTS, Formatter
logger = get_logger(__name__)
@dataclass
class Template:
format_user: "Formatter"
format_assistant: "Formatter"
format_system: "Formatter"
format_function: "Formatter"
format_observation: "Formatter"
format_tools: "Formatter"
format_separator: "Formatter"
format_prefix: "Formatter"
default_system: str
stop_words: List[str]
efficient_eos: bool
replace_eos: bool
mm_plugin: "BasePlugin"
def encode_oneturn(
self,
tokenizer: "PreTrainedTokenizer",
messages: Sequence[Dict[str, str]],
system: Optional[str] = None,
tools: Optional[str] = None,
) -> Tuple[List[int], List[int]]:
r"""
Returns a single pair of token ids representing prompt and response respectively.
"""
encoded_messages = self._encode(tokenizer, messages, system, tools)
prompt_ids = []
for encoded_ids in encoded_messages[:-1]:
prompt_ids += encoded_ids
answer_ids = encoded_messages[-1]
return prompt_ids, answer_ids
def encode_multiturn(
self,
tokenizer: "PreTrainedTokenizer",
messages: Sequence[Dict[str, str]],
system: Optional[str] = None,
tools: Optional[str] = None,
) -> List[Tuple[List[int], List[int]]]:
r"""
Returns multiple pairs of token ids representing prompts and responses respectively.
"""
encoded_messages = self._encode(tokenizer, messages, system, tools)
return [(encoded_messages[i], encoded_messages[i + 1]) for i in range(0, len(encoded_messages), 2)]
def extract_tool(self, content: str) -> Union[str, List[Tuple[str, str]]]:
r"""
Extracts tool message.
"""
return self.format_tools.extract(content)
def _encode(
self,
tokenizer: "PreTrainedTokenizer",
messages: Sequence[Dict[str, str]],
system: Optional[str],
tools: Optional[str],
) -> List[List[int]]:
r"""
Encodes formatted inputs to pairs of token ids.
        Turn 0: prefix + system + query | resp
        Turn t: sep + query             | resp
"""
system = system or self.default_system
encoded_messages = []
for i, message in enumerate(messages):
elements = []
if i == 0:
elements += self.format_prefix.apply()
if system or tools:
tool_text = self.format_tools.apply(content=tools)[0] if tools else ""
elements += self.format_system.apply(content=(system + tool_text))
if i > 0 and i % 2 == 0:
elements += self.format_separator.apply()
if message["role"] == Role.USER.value:
elements += self.format_user.apply(content=message["content"], idx=str(i // 2))
elif message["role"] == Role.ASSISTANT.value:
elements += self.format_assistant.apply(content=message["content"])
elif message["role"] == Role.OBSERVATION.value:
elements += self.format_observation.apply(content=message["content"])
elif message["role"] == Role.FUNCTION.value:
elements += self.format_function.apply(content=message["content"])
else:
raise NotImplementedError("Unexpected role: {}".format(message["role"]))
encoded_messages.append(self._convert_elements_to_ids(tokenizer, elements))
return encoded_messages
def _convert_elements_to_ids(self, tokenizer: "PreTrainedTokenizer", elements: "SLOTS") -> List[int]:
r"""
Converts elements to token ids.
"""
token_ids = []
for elem in elements:
if isinstance(elem, str):
if len(elem) != 0:
token_ids += tokenizer.encode(elem, add_special_tokens=False)
elif isinstance(elem, dict):
token_ids += [tokenizer.convert_tokens_to_ids(elem.get("token"))]
elif isinstance(elem, set):
if "bos_token" in elem and tokenizer.bos_token_id is not None:
token_ids += [tokenizer.bos_token_id]
elif "eos_token" in elem and tokenizer.eos_token_id is not None:
token_ids += [tokenizer.eos_token_id]
else:
raise ValueError("Input must be string, set[str] or dict[str, str], got {}".format(type(elem)))
return token_ids
@dataclass
class Llama2Template(Template):
def _encode(
self,
tokenizer: "PreTrainedTokenizer",
messages: Sequence[Dict[str, str]],
system: str,
tools: str,
) -> List[List[int]]:
r"""
Encodes formatted inputs to pairs of token ids.
        Turn 0: prefix + system + query | resp
        Turn t: sep + query             | resp
"""
system = system or self.default_system
encoded_messages = []
for i, message in enumerate(messages):
elements = []
system_text = ""
if i == 0:
elements += self.format_prefix.apply()
if system or tools:
tool_text = self.format_tools.apply(content=tools)[0] if tools else ""
system_text = self.format_system.apply(content=(system + tool_text))[0]
if i > 0 and i % 2 == 0:
elements += self.format_separator.apply()
if message["role"] == Role.USER.value:
elements += self.format_user.apply(content=system_text + message["content"])
elif message["role"] == Role.ASSISTANT.value:
elements += self.format_assistant.apply(content=message["content"])
elif message["role"] == Role.OBSERVATION.value:
elements += self.format_observation.apply(content=message["content"])
elif message["role"] == Role.FUNCTION.value:
elements += self.format_function.apply(content=message["content"])
else:
raise NotImplementedError("Unexpected role: {}".format(message["role"]))
encoded_messages.append(self._convert_elements_to_ids(tokenizer, elements))
return encoded_messages
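
The only difference from the base class is that the rendered system block is folded into the first user message instead of being emitted on its own. With the `llama2` template registered later in this file, the first turn therefore renders (after the BOS token) roughly as sketched below; this is an illustration, not library output:

```python
def render_llama2_first_turn(system: str, user: str) -> str:
    """Illustrates how Llama2Template folds the system block into the first user turn."""
    system_text = "<<SYS>>\n{}\n<</SYS>>\n\n".format(system)  # format_system slot
    return "[INST] {} [/INST]".format(system_text + user)     # format_user slot


print(render_llama2_first_turn("You are a helpful assistant.", "How are you"))
```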
TEMPLATES: Dict[str, Template] = {}
def _register_template(
name: str,
format_user: Optional["Formatter"] = None,
format_assistant: Optional["Formatter"] = None,
format_system: Optional["Formatter"] = None,
format_function: Optional["Formatter"] = None,
format_observation: Optional["Formatter"] = None,
format_tools: Optional["Formatter"] = None,
format_separator: Optional["Formatter"] = None,
format_prefix: Optional["Formatter"] = None,
default_system: str = "",
stop_words: Sequence[str] = [],
efficient_eos: bool = False,
replace_eos: bool = False,
mm_plugin: "BasePlugin" = BasePlugin(IMAGE_PLACEHOLDER),
) -> None:
r"""
Registers a chat template.
To add the following chat template:
```
[HUMAN]:
user prompt here
[AI]:
model response here
[HUMAN]:
user prompt here
[AI]:
model response here
```
The corresponding code should be:
```
_register_template(
name="custom",
format_user=StringFormatter(slots=["[HUMAN]:\n{{content}}\n[AI]:\n"]),
format_separator=EmptyFormatter(slots=["\n\n"]),
efficient_eos=True,
)
```
"""
eos_slots = [] if efficient_eos else [{"eos_token"}]
template_class = Llama2Template if name.startswith("llama2") else Template
default_user_formatter = StringFormatter(slots=["{{content}}"])
default_assistant_formatter = StringFormatter(slots=["{{content}}"] + eos_slots)
default_function_formatter = FunctionFormatter(slots=eos_slots, tool_format="default")
default_tool_formatter = ToolFormatter(tool_format="default")
default_separator_formatter = EmptyFormatter()
default_prefix_formatter = EmptyFormatter()
TEMPLATES[name] = template_class(
format_user=format_user or default_user_formatter,
format_assistant=format_assistant or default_assistant_formatter,
format_system=format_system or default_user_formatter,
format_function=format_function or default_function_formatter,
format_observation=format_observation or format_user or default_user_formatter,
format_tools=format_tools or default_tool_formatter,
format_separator=format_separator or default_separator_formatter,
format_prefix=format_prefix or default_prefix_formatter,
default_system=default_system,
stop_words=stop_words,
efficient_eos=efficient_eos,
replace_eos=replace_eos,
mm_plugin=mm_plugin,
)
def _add_or_replace_eos_token(tokenizer: "PreTrainedTokenizer", eos_token: str) -> None:
is_added = tokenizer.eos_token_id is None
num_added_tokens = tokenizer.add_special_tokens({"eos_token": eos_token})
if is_added:
logger.info("Add eos token: {}".format(tokenizer.eos_token))
else:
logger.info("Replace eos token: {}".format(tokenizer.eos_token))
if num_added_tokens > 0:
logger.warning("New tokens have been added, make sure `resize_vocab` is True.")
def _jinja_escape(content: str) -> str:
return content.replace("'", r"\'")
def _convert_slots_to_jinja(slots: "SLOTS", tokenizer: "PreTrainedTokenizer", placeholder: str = "content") -> str:
slot_items = []
for slot in slots:
if isinstance(slot, str):
slot_pieces = slot.split("{{content}}")
if slot_pieces[0]:
slot_items.append("'" + _jinja_escape(slot_pieces[0]) + "'")
if len(slot_pieces) > 1:
slot_items.append(placeholder)
if slot_pieces[1]:
slot_items.append("'" + _jinja_escape(slot_pieces[1]) + "'")
elif isinstance(slot, set): # do not use {{ eos_token }} since it may be replaced
if "bos_token" in slot and tokenizer.bos_token_id is not None:
slot_items.append("'" + tokenizer.bos_token + "'")
elif "eos_token" in slot and tokenizer.eos_token_id is not None:
slot_items.append("'" + tokenizer.eos_token + "'")
elif isinstance(slot, dict):
raise ValueError("Dict is not supported.")
return " + ".join(slot_items)
def _get_jinja_template(template: "Template", tokenizer: "PreTrainedTokenizer") -> str:
jinja_template = ""
prefix = _convert_slots_to_jinja(template.format_prefix.apply(), tokenizer)
if prefix:
jinja_template += "{{ " + prefix + " }}"
if template.default_system:
jinja_template += "{% set system_message = '" + _jinja_escape(template.default_system) + "' %}"
jinja_template += (
"{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}"
"{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% endif %}"
)
system_message = _convert_slots_to_jinja(template.format_system.apply(), tokenizer, placeholder="system_message")
if not isinstance(template, Llama2Template):
jinja_template += "{% if system_message is defined %}{{ " + system_message + " }}{% endif %}"
jinja_template += "{% for message in loop_messages %}"
jinja_template += "{% set content = message['content'] %}"
if isinstance(template, Llama2Template):
jinja_template += "{% if loop.index0 == 0 and system_message is defined %}"
jinja_template += "{% set content = " + system_message + " + message['content'] %}"
jinja_template += "{% endif %}"
jinja_template += "{% if message['role'] == 'user' %}"
user_message = _convert_slots_to_jinja(template.format_user.apply(), tokenizer)
jinja_template += "{{ " + user_message + " }}"
jinja_template += "{% elif message['role'] == 'assistant' %}"
assistant_message = _convert_slots_to_jinja(
template.format_assistant.apply() + template.format_separator.apply(), tokenizer
)
jinja_template += "{{ " + assistant_message + " }}"
jinja_template += "{% endif %}"
jinja_template += "{% endfor %}"
return jinja_template
def get_template_and_fix_tokenizer(
tokenizer: "PreTrainedTokenizer",
name: Optional[str] = None,
tool_format: Optional[str] = None,
) -> Template:
if name is None:
template = TEMPLATES["empty"] # placeholder
else:
template = TEMPLATES.get(name, None)
if template is None:
raise ValueError("Template {} does not exist.".format(name))
if tool_format is not None:
logger.info("Using tool format: {}.".format(tool_format))
eos_slots = [] if template.efficient_eos else [{"eos_token"}]
template.format_tools = ToolFormatter(tool_format=tool_format)
template.format_function = FunctionFormatter(slots=eos_slots, tool_format=tool_format)
stop_words = template.stop_words
if template.replace_eos:
if not stop_words:
raise ValueError("Stop words are required to replace the EOS token.")
_add_or_replace_eos_token(tokenizer, eos_token=stop_words[0])
stop_words = stop_words[1:]
if tokenizer.eos_token_id is None:
_add_or_replace_eos_token(tokenizer, eos_token="<|endoftext|>")
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
logger.info("Add pad token: {}".format(tokenizer.pad_token))
if stop_words:
num_added_tokens = tokenizer.add_special_tokens(
dict(additional_special_tokens=stop_words), replace_additional_special_tokens=False
)
logger.info("Add {} to stop words.".format(",".join(stop_words)))
if num_added_tokens > 0:
logger.warning("New tokens have been added, make sure `resize_vocab` is True.")
try:
tokenizer.chat_template = _get_jinja_template(template, tokenizer)
except ValueError:
logger.info("Cannot add this chat template to tokenizer.")
return template
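
A typical call site pairs this function with `encode_oneturn`; a minimal usage sketch (the tiny model below is the placeholder used by the template tests later in this change):

```python
from transformers import AutoTokenizer

from llamafactory.data import get_template_and_fix_tokenizer

tokenizer = AutoTokenizer.from_pretrained("llamafactory/tiny-random-Llama-3")  # placeholder model
template = get_template_and_fix_tokenizer(tokenizer, name="llama3")
messages = [
    {"role": "user", "content": "How are you"},
    {"role": "assistant", "content": "I am fine!"},
]
prompt_ids, answer_ids = template.encode_oneturn(tokenizer, messages)
print(tokenizer.decode(prompt_ids))
print(tokenizer.decode(answer_ids))
```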
_register_template(
name="alpaca",
format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n\n### Response:\n"]),
format_separator=EmptyFormatter(slots=["\n\n"]),
default_system=(
"Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
),
)
_register_template(
name="aquila",
format_user=StringFormatter(slots=["Human: {{content}}###Assistant:"]),
format_separator=EmptyFormatter(slots=["###"]),
default_system=(
"A chat between a curious human and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the human's questions."
),
stop_words=["</s>"],
efficient_eos=True,
)
_register_template(
name="atom",
format_user=StringFormatter(
slots=[{"bos_token"}, "Human: {{content}}\n", {"eos_token"}, {"bos_token"}, "Assistant:"]
),
format_assistant=StringFormatter(slots=["{{content}}\n", {"eos_token"}]),
)
_register_template(
name="baichuan",
format_user=StringFormatter(slots=[{"token": "<reserved_102>"}, "{{content}}", {"token": "<reserved_103>"}]),
efficient_eos=True,
)
_register_template(
name="baichuan2",
format_user=StringFormatter(slots=["<reserved_106>{{content}}<reserved_107>"]),
efficient_eos=True,
)
_register_template(
name="belle",
format_user=StringFormatter(slots=["Human: {{content}}\n\nBelle: "]),
format_separator=EmptyFormatter(slots=["\n\n"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
)
_register_template(
name="bluelm",
format_user=StringFormatter(slots=[{"token": "[|Human|]:"}, "{{content}}", {"token": "[|AI|]:"}]),
)
_register_template(
name="breeze",
format_user=StringFormatter(slots=["[INST] {{content}} [/INST] "]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
efficient_eos=True,
)
_register_template(
name="chatglm2",
format_user=StringFormatter(slots=["[Round {{idx}}]\n\n问:{{content}}\n\n答:"]),
format_separator=EmptyFormatter(slots=["\n\n"]),
format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]),
efficient_eos=True,
)
_register_template(
name="chatglm3",
format_user=StringFormatter(slots=[{"token": "<|user|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]),
format_assistant=StringFormatter(slots=["\n", "{{content}}"]),
format_system=StringFormatter(slots=[{"token": "<|system|>"}, "\n", "{{content}}"]),
format_function=FunctionFormatter(slots=[], tool_format="glm4"),
format_observation=StringFormatter(
slots=[{"token": "<|observation|>"}, "\n", "{{content}}", {"token": "<|assistant|>"}]
),
format_tools=ToolFormatter(tool_format="glm4"),
format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]),
stop_words=["<|user|>", "<|observation|>"],
efficient_eos=True,
)
_register_template(
name="chatml",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
stop_words=["<|im_end|>", "<|im_start|>"],
replace_eos=True,
)
_register_template(
name="chatml_de",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
default_system="Du bist ein freundlicher und hilfsbereiter KI-Assistent.",
stop_words=["<|im_end|>", "<|im_start|>"],
replace_eos=True,
)
_register_template(
name="codegeex2",
format_prefix=EmptyFormatter(slots=[{"token": "[gMASK]"}, {"token": "sop"}]),
)
_register_template(
name="codegeex4",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]),
format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
format_function=FunctionFormatter(slots=[], tool_format="glm4"),
format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>\n"]),
format_tools=ToolFormatter(tool_format="glm4"),
format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
default_system=(
"你是一位智能编程助手,你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题,"
"并提供格式规范、可以执行、准确安全的代码,并在必要时提供详细的解释。"
),
stop_words=["<|user|>", "<|observation|>"],
efficient_eos=True,
)
_register_template(
name="cohere",
format_user=StringFormatter(
slots=[
(
"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"
"<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"
)
]
),
format_system=StringFormatter(slots=["<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{content}}<|END_OF_TURN_TOKEN|>"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
)
_register_template(
name="cpm",
format_user=StringFormatter(slots=["<用户>{{content}}<AI>"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
)
_register_template(
name="dbrx",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
default_system=(
"You are DBRX, created by Databricks. You were last updated in December 2023. "
"You answer questions based on information available up to that point.\n"
"YOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, but provide thorough "
"responses to more complex and open-ended questions.\nYou assist with various tasks, "
"from writing to coding (using markdown for code blocks — remember to use ``` with "
"code, JSON, and tables).\n(You do not have real-time data access or code execution "
"capabilities. You avoid stereotyping and provide balanced perspectives on "
"controversial topics. You do not provide song lyrics, poems, or news articles and "
"do not divulge details of your training data.)\nThis is your system prompt, "
"guiding your responses. Do not reference it, just respond to the user. If you find "
"yourself talking about this message, stop. You should be responding appropriately "
"and usually that means not mentioning this.\nYOU DO NOT MENTION ANY OF THIS INFORMATION "
"ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY PERTINENT TO THE USER'S QUERY."
),
stop_words=["<|im_end|>"],
replace_eos=True,
)
_register_template(
name="deepseek",
format_user=StringFormatter(slots=["User: {{content}}\n\nAssistant:"]),
format_system=StringFormatter(slots=["{{content}}\n\n"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
)
_register_template(
name="deepseekcoder",
format_user=StringFormatter(slots=["### Instruction:\n{{content}}\n### Response:"]),
format_assistant=StringFormatter(slots=["\n{{content}}\n<|EOT|>"]),
format_separator=EmptyFormatter(slots=["\n"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
default_system=(
"You are an AI programming assistant, utilizing the DeepSeek Coder model, "
"developed by DeepSeek Company, and you only answer questions related to computer science. "
"For politically sensitive questions, security and privacy issues, "
"and other non-computer science questions, you will refuse to answer.\n"
),
)
_register_template(
name="default",
format_user=StringFormatter(slots=["Human: {{content}}\nAssistant:"]),
format_system=StringFormatter(slots=["{{content}}\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
)
_register_template(
name="empty",
efficient_eos=True,
)
_register_template(
name="falcon",
format_user=StringFormatter(slots=["User: {{content}}\nFalcon:"]),
format_separator=EmptyFormatter(slots=["\n"]),
efficient_eos=True,
)
_register_template(
name="fewshot",
format_separator=EmptyFormatter(slots=["\n\n"]),
efficient_eos=True,
)
_register_template(
name="gemma",
format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
format_observation=StringFormatter(
slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
),
format_separator=EmptyFormatter(slots=["<end_of_turn>\n"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
efficient_eos=True,
)
_register_template(
name="glm4",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>"]),
format_assistant=StringFormatter(slots=["\n{{content}}"]),
format_system=StringFormatter(slots=["<|system|>\n{{content}}"]),
format_function=FunctionFormatter(slots=[], tool_format="glm4"),
format_observation=StringFormatter(slots=["<|observation|>\n{{content}}<|assistant|>"]),
format_tools=ToolFormatter(tool_format="glm4"),
format_prefix=EmptyFormatter(slots=["[gMASK]<sop>"]),
stop_words=["<|user|>", "<|observation|>"],
efficient_eos=True,
)
_register_template(
name="intern",
format_user=StringFormatter(slots=["<|User|>:{{content}}\n<|Bot|>:"]),
format_system=StringFormatter(slots=["<|System|>:{{content}}\n"]),
format_separator=EmptyFormatter(slots=["<eoa>\n"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
stop_words=["<eoa>"],
efficient_eos=True, # internlm tokenizer cannot set eos_token_id
)
_register_template(
name="intern2",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
format_separator=EmptyFormatter(slots=["<|im_end|>\n"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
stop_words=["<|im_end|>"],
efficient_eos=True, # internlm2 tokenizer cannot set eos_token_id
)
_register_template(
name="llama2",
format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
)
_register_template(
name="llama2_zh",
format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]),
format_system=StringFormatter(slots=["<<SYS>>\n{{content}}\n<</SYS>>\n\n"]),
default_system="You are a helpful assistant. 你是一个乐于助人的助手。",
)
_register_template(
name="llama3",
format_user=StringFormatter(
slots=[
(
"<|start_header_id|>user<|end_header_id|>\n\n{{content}}<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\n"
)
]
),
format_system=StringFormatter(slots=["<|start_header_id|>system<|end_header_id|>\n\n{{content}}<|eot_id|>"]),
format_observation=StringFormatter(
slots=[
(
"<|start_header_id|>tool<|end_header_id|>\n\n{{content}}<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\n"
)
]
),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
stop_words=["<|eot_id|>"],
replace_eos=True,
)
_register_template(
name="llava",
format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
default_system=(
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's questions."
),
mm_plugin=get_mm_plugin(name="llava", image_token="<image>"),
)
_register_template(
name="mistral",
format_user=StringFormatter(slots=["[INST] {{content}} [/INST]"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
)
_register_template(
name="olmo",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|assistant|>\n"]),
format_prefix=EmptyFormatter(slots=[{"eos_token"}]),
)
_register_template(
name="openchat",
format_user=StringFormatter(slots=["GPT4 Correct User: {{content}}", {"eos_token"}, "GPT4 Correct Assistant:"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
)
_register_template(
name="openchat-3.6",
format_user=StringFormatter(
slots=[
(
"<|start_header_id|>GPT4 Correct User<|end_header_id|>\n\n{{content}}<|eot_id|>"
"<|start_header_id|>GPT4 Correct Assistant<|end_header_id|>\n\n"
)
]
),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
stop_words=["<|eot_id|>"],
replace_eos=True,
)
_register_template(
name="orion",
format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: ", {"eos_token"}]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
)
_register_template(
name="paligemma",
format_user=StringFormatter(slots=["<start_of_turn>user\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]),
format_observation=StringFormatter(
slots=["<start_of_turn>tool\n{{content}}<end_of_turn>\n<start_of_turn>model\n"]
),
format_separator=EmptyFormatter(slots=["<end_of_turn>\n"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
efficient_eos=True,
mm_plugin=get_mm_plugin(name="paligemma", image_token="<image>"),
)
_register_template(
name="phi",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>\n"]),
format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
format_prefix=EmptyFormatter(slots=[{"bos_token"}]),
stop_words=["<|end|>"],
replace_eos=True,
)
_register_template(
name="qwen",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
default_system="You are a helpful assistant.",
stop_words=["<|im_end|>"],
replace_eos=True,
)
_register_template(
name="qwen2_vl",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
format_observation=StringFormatter(slots=["<|im_start|>tool\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
default_system="You are a helpful assistant.",
stop_words=["<|im_end|>"],
replace_eos=True,
mm_plugin=get_mm_plugin(name="qwen2_vl", image_token="<|image_pad|>"),
)
_register_template(
name="sailor",
format_user=StringFormatter(slots=["<|im_start|>question\n{{content}}<|im_end|>\n<|im_start|>answer\n"]),
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
default_system=(
"You are an AI assistant named Sailor created by Sea AI Lab. "
"Your answer should be friendly, unbiased, faithful, informative and detailed."
),
stop_words=["<|im_end|>"],
replace_eos=True,
)
_register_template(
name="solar",
format_user=StringFormatter(slots=["### User:\n{{content}}\n\n### Assistant:\n"]),
format_system=StringFormatter(slots=["### System:\n{{content}}\n\n"]),
efficient_eos=True,
)
_register_template(
name="starchat",
format_user=StringFormatter(slots=["<|user|>\n{{content}}<|end|>\n<|assistant|>"]),
format_system=StringFormatter(slots=["<|system|>\n{{content}}<|end|>\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
stop_words=["<|end|>"],
replace_eos=True,
)
_register_template(
name="telechat",
format_user=StringFormatter(slots=["<_user>{{content}}<_bot>"]),
format_system=StringFormatter(slots=["<_system>{{content}}<_end>"]),
stop_words=["<_end>"],
replace_eos=True,
)
_register_template(
name="vicuna",
format_user=StringFormatter(slots=["USER: {{content}} ASSISTANT:"]),
default_system=(
"A chat between a curious user and an artificial intelligence assistant. "
"The assistant gives helpful, detailed, and polite answers to the user's questions."
),
)
_register_template(
name="xuanyuan",
format_user=StringFormatter(slots=["Human: {{content}} Assistant:"]),
default_system=(
"以下是用户和人工智能助手之间的对话。用户以Human开头,人工智能助手以Assistant开头,"
"会对人类提出的问题给出有帮助、高质量、详细和礼貌的回答,并且总是拒绝参与与不道德、"
"不安全、有争议、政治敏感等相关的话题、问题和指示。\n"
),
)
_register_template(
name="xverse",
format_user=StringFormatter(slots=["Human: {{content}}\n\nAssistant: "]),
)
_register_template(
name="yayi",
format_user=StringFormatter(slots=[{"token": "<|Human|>"}, ":\n{{content}}\n\n", {"token": "<|YaYi|>"}, ":"]),
format_system=StringFormatter(slots=[{"token": "<|System|>"}, ":\n{{content}}\n\n"]),
format_separator=EmptyFormatter(slots=["\n\n"]),
default_system=(
"You are a helpful, respectful and honest assistant named YaYi "
"developed by Beijing Wenge Technology Co.,Ltd. "
"Always answer as helpfully as possible, while being safe. "
"Your answers should not include any harmful, unethical, "
"racist, sexist, toxic, dangerous, or illegal content. "
"Please ensure that your responses are socially unbiased and positive in nature.\n\n"
"If a question does not make any sense, or is not factually coherent, "
"explain why instead of answering something not correct. "
"If you don't know the answer to a question, please don't share false information."
),
stop_words=["<|End|>"],
)
_register_template(
name="yi",
format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]),
format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]),
format_separator=EmptyFormatter(slots=["\n"]),
stop_words=["<|im_end|>"],
replace_eos=True,
)
_register_template(
name="yi_vl",
format_user=StringFormatter(slots=["### Human: {{content}}\n### Assistant:"]),
format_separator=EmptyFormatter(slots=["\n"]),
default_system=(
"This is a chat between an inquisitive human and an AI assistant. "
"Assume the role of the AI assistant. Read all the images carefully, "
"and respond to the human's questions with informative, helpful, detailed and polite answers. "
"这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。"
"仔细阅读所有的图像,并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。\n\n"
),
stop_words=["###"],
efficient_eos=True,
mm_plugin=get_mm_plugin(name="llava", image_token="<image>"),
)
_register_template(
name="yuan",
format_user=StringFormatter(slots=["{{content}}", {"token": "<sep>"}]),
format_separator=EmptyFormatter(slots=["\n"]),
stop_words=["<eod>"],
replace_eos=True,
)
_register_template(
name="zephyr",
format_user=StringFormatter(slots=["<|user|>\n{{content}}", {"eos_token"}, "<|assistant|>\n"]),
format_system=StringFormatter(slots=["<|system|>\n{{content}}", {"eos_token"}]),
default_system="You are Zephyr, a helpful assistant.",
)
_register_template(
name="ziya",
format_user=StringFormatter(slots=["<human>:{{content}}\n<bot>:"]),
format_separator=EmptyFormatter(slots=["\n"]),
)
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import re
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Tuple, Union
from .data_utils import SLOTS
DEFAULT_TOOL_PROMPT = (
"You have access to the following tools:\n{tool_text}"
"Use the following format if using a tool:\n"
"```\n"
"Action: tool name (one of [{tool_names}])\n"
"Action Input: the input to the tool, in a JSON format representing the kwargs "
"""(e.g. ```{{"input": "hello world", "num_beams": 5}}```)\n"""
"```\n"
)
GLM4_TOOL_PROMPT = (
"你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,"
"你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具{tool_text}"
)
@dataclass
class ToolUtils(ABC):
@staticmethod
@abstractmethod
def get_function_slots() -> SLOTS: ...
@staticmethod
@abstractmethod
def tool_formatter(tools: List[Dict[str, Any]]) -> str: ...
@staticmethod
@abstractmethod
def tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]: ...
class DefaultToolUtils(ToolUtils):
@staticmethod
def get_function_slots() -> SLOTS:
return ["Action: {{name}}\nAction Input: {{arguments}}\n"]
@staticmethod
def tool_formatter(tools: List[Dict[str, Any]]) -> str:
tool_text = ""
tool_names = []
for tool in tools:
param_text = ""
for name, param in tool["parameters"]["properties"].items():
required, enum, items = "", "", ""
if name in tool["parameters"].get("required", []):
required = ", required"
if param.get("enum", None):
enum = ", should be one of [{}]".format(", ".join(param["enum"]))
if param.get("items", None):
items = ", where each item should be {}".format(param["items"].get("type", ""))
param_text += " - {name} ({type}{required}): {desc}{enum}{items}\n".format(
name=name,
type=param.get("type", ""),
required=required,
desc=param.get("description", ""),
enum=enum,
items=items,
)
tool_text += "> Tool Name: {name}\nTool Description: {desc}\nTool Args:\n{args}\n".format(
name=tool["name"], desc=tool.get("description", ""), args=param_text
)
tool_names.append(tool["name"])
return DEFAULT_TOOL_PROMPT.format(tool_text=tool_text, tool_names=", ".join(tool_names))
@staticmethod
def tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]:
regex = re.compile(r"Action:\s*([a-zA-Z0-9_]+)\s*Action Input:\s*(.+?)(?=\s*Action:|\s*$)", re.DOTALL)
action_match: List[Tuple[str, str]] = re.findall(regex, content)
if not action_match:
return content
results = []
for match in action_match:
tool_name = match[0].strip()
tool_input = match[1].strip().strip('"').strip("```")
try:
arguments = json.loads(tool_input)
results.append((tool_name, json.dumps(arguments, ensure_ascii=False)))
except json.JSONDecodeError:
return content
return results
class GLM4ToolUtils(ToolUtils):
@staticmethod
def get_function_slots() -> SLOTS:
return ["{{name}}\n{{arguments}}"]
@staticmethod
def tool_formatter(tools: List[Dict[str, Any]]) -> str:
tool_text = ""
for tool in tools:
tool_text += "\n\n## {name}\n\n{body}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format(
name=tool["name"], body=json.dumps(tool, indent=4, ensure_ascii=False)
)
return GLM4_TOOL_PROMPT.format(tool_text=tool_text)
@staticmethod
def tool_extractor(content: str) -> Union[str, List[Tuple[str, str]]]:
if "\n" not in content:
return content
tool_name, tool_input = content.split("\n", maxsplit=1)
try:
arguments = json.loads(tool_input)
except json.JSONDecodeError:
return content
return [(tool_name, json.dumps(arguments, ensure_ascii=False))]
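
These utilities are consumed through `ToolFormatter` and `FunctionFormatter` rather than called directly; a short usage sketch of the default-format round trip (mirroring the formatter tests later in this change):

```python
import json

from llamafactory.data.formatter import ToolFormatter

formatter = ToolFormatter(tool_format="default")
tools = [
    {
        "name": "test_tool",
        "description": "tool_desc",
        "parameters": {"type": "object", "properties": {"foo": {"type": "string"}}, "required": ["foo"]},
    }
]
prompt_text = formatter.apply(content=json.dumps(tools))[0]  # system prompt listing the tools
calls = formatter.extract('Action: test_tool\nAction Input: {"foo": "bar"}\n')
print(calls)  # [('test_tool', '{"foo": "bar"}')]
```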
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import pytest
from datasets import load_dataset
from transformers import AutoTokenizer
from llamafactory.extras.constants import IGNORE_INDEX
from llamafactory.train.test_utils import load_train_dataset
DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
TRAIN_ARGS = {
"model_name_or_path": TINY_LLAMA,
"stage": "kto",
"do_train": True,
"finetuning_type": "full",
"dataset": "kto_en_demo",
"dataset_dir": "REMOTE:" + DEMO_DATA,
"template": "llama3",
"cutoff_len": 8192,
"overwrite_cache": True,
"output_dir": "dummy_dir",
"overwrite_output_dir": True,
"fp16": True,
}
@pytest.mark.parametrize("num_samples", [16])
def test_feedback_data(num_samples: int):
train_dataset = load_train_dataset(**TRAIN_ARGS)
ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
original_data = load_dataset(DEMO_DATA, name="kto_en_demo", split="train")
indexes = random.choices(range(len(original_data)), k=num_samples)
for index in indexes:
messages = original_data["messages"][index]
ref_input_ids = ref_tokenizer.apply_chat_template(messages)
prompt_len = len(ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True))
ref_labels = [IGNORE_INDEX] * prompt_len + ref_input_ids[prompt_len:]
assert train_dataset["input_ids"][index] == ref_input_ids
assert train_dataset["labels"][index] == ref_labels
assert train_dataset["kto_tags"][index] == original_data["label"][index]
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
from typing import Dict, List
import pytest
from datasets import load_dataset
from transformers import AutoTokenizer
from llamafactory.extras.constants import IGNORE_INDEX
from llamafactory.train.test_utils import load_train_dataset
DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
TRAIN_ARGS = {
"model_name_or_path": TINY_LLAMA,
"stage": "rm",
"do_train": True,
"finetuning_type": "full",
"dataset": "dpo_en_demo",
"dataset_dir": "REMOTE:" + DEMO_DATA,
"template": "llama3",
"cutoff_len": 8192,
"overwrite_cache": True,
"output_dir": "dummy_dir",
"overwrite_output_dir": True,
"fp16": True,
}
def _convert_sharegpt_to_openai(messages: List[Dict[str, str]]) -> List[Dict[str, str]]:
role_mapping = {"human": "user", "gpt": "assistant", "system": "system"}
new_messages = []
for message in messages:
new_messages.append({"role": role_mapping[message["from"]], "content": message["value"]})
return new_messages
@pytest.mark.parametrize("num_samples", [16])
def test_pairwise_data(num_samples: int):
train_dataset = load_train_dataset(**TRAIN_ARGS)
ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
original_data = load_dataset(DEMO_DATA, name="dpo_en_demo", split="train")
indexes = random.choices(range(len(original_data)), k=num_samples)
for index in indexes:
chosen_messages = original_data["conversations"][index] + [original_data["chosen"][index]]
rejected_messages = original_data["conversations"][index] + [original_data["rejected"][index]]
chosen_messages = _convert_sharegpt_to_openai(chosen_messages)
rejected_messages = _convert_sharegpt_to_openai(rejected_messages)
ref_chosen_input_ids = ref_tokenizer.apply_chat_template(chosen_messages)
chosen_prompt_len = len(ref_tokenizer.apply_chat_template(chosen_messages[:-1], add_generation_prompt=True))
ref_chosen_labels = [IGNORE_INDEX] * chosen_prompt_len + ref_chosen_input_ids[chosen_prompt_len:]
ref_rejected_input_ids = ref_tokenizer.apply_chat_template(rejected_messages)
rejected_prompt_len = len(
ref_tokenizer.apply_chat_template(rejected_messages[:-1], add_generation_prompt=True)
)
ref_rejected_labels = [IGNORE_INDEX] * rejected_prompt_len + ref_rejected_input_ids[rejected_prompt_len:]
assert train_dataset["chosen_input_ids"][index] == ref_chosen_input_ids
assert train_dataset["chosen_labels"][index] == ref_chosen_labels
assert train_dataset["rejected_input_ids"][index] == ref_rejected_input_ids
assert train_dataset["rejected_labels"][index] == ref_rejected_labels
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Tuple
import pytest
from llamafactory.data.processors.processor_utils import infer_seqlen
@pytest.mark.parametrize(
"test_input,test_output",
[
((3000, 2000, 1000), (600, 400)),
((2000, 3000, 1000), (400, 600)),
((1000, 100, 1000), (900, 100)),
((100, 1000, 1000), (100, 900)),
((100, 500, 1000), (100, 500)),
((500, 100, 1000), (500, 100)),
((10, 10, 1000), (10, 10)),
],
)
def test_infer_seqlen(test_input: Tuple[int, int, int], test_output: Tuple[int, int]):
assert test_output == infer_seqlen(*test_input)
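
For reference, the behaviour exercised above amounts to splitting `cutoff_len` between source and target: whichever side is already short is kept intact, and when both are long the budget is split proportionally. A sketch consistent with the parametrized cases above (`infer_seqlen_sketch` is a hypothetical name; the real helper lives in `processor_utils` and may be written differently):

```python
from typing import Tuple


def infer_seqlen_sketch(source_len: int, target_len: int, cutoff_len: int) -> Tuple[int, int]:
    """Split cutoff_len between source and target, favoring whichever side is short."""
    if target_len * 2 < cutoff_len:          # target is short: give the rest to the source
        max_target_len = cutoff_len
    elif source_len * 2 < cutoff_len:        # source is short: give the rest to the target
        max_target_len = cutoff_len - source_len
    else:                                    # both are long: split proportionally
        max_target_len = int(cutoff_len * (target_len / (source_len + target_len)))
    new_target_len = min(max_target_len, target_len)
    new_source_len = min(max(cutoff_len - new_target_len, 0), source_len)
    return new_source_len, new_target_len


assert infer_seqlen_sketch(3000, 2000, 1000) == (600, 400)
assert infer_seqlen_sketch(10, 10, 1000) == (10, 10)
```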
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import pytest
from datasets import load_dataset
from transformers import AutoTokenizer
from llamafactory.extras.constants import IGNORE_INDEX
from llamafactory.train.test_utils import load_train_dataset
DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
TINY_DATA = os.environ.get("TINY_DATA", "llamafactory/tiny-supervised-dataset")
TRAIN_ARGS = {
"model_name_or_path": TINY_LLAMA,
"stage": "sft",
"do_train": True,
"finetuning_type": "full",
"template": "llama3",
"cutoff_len": 8192,
"overwrite_cache": True,
"output_dir": "dummy_dir",
"overwrite_output_dir": True,
"fp16": True,
}
@pytest.mark.parametrize("num_samples", [16])
def test_supervised_single_turn(num_samples: int):
train_dataset = load_train_dataset(dataset_dir="ONLINE", dataset=TINY_DATA, **TRAIN_ARGS)
ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
original_data = load_dataset(TINY_DATA, split="train")
indexes = random.choices(range(len(original_data)), k=num_samples)
for index in indexes:
prompt = original_data["instruction"][index]
if original_data["input"][index]:
prompt += "\n" + original_data["input"][index]
messages = [
{"role": "user", "content": prompt},
{"role": "assistant", "content": original_data["output"][index]},
]
ref_input_ids = ref_tokenizer.apply_chat_template(messages)
assert train_dataset["input_ids"][index] == ref_input_ids
@pytest.mark.parametrize("num_samples", [8])
def test_supervised_multi_turn(num_samples: int):
train_dataset = load_train_dataset(dataset_dir="REMOTE:" + DEMO_DATA, dataset="system_chat", **TRAIN_ARGS)
ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
original_data = load_dataset(DEMO_DATA, name="system_chat", split="train")
indexes = random.choices(range(len(original_data)), k=num_samples)
for index in indexes:
ref_input_ids = ref_tokenizer.apply_chat_template(original_data["messages"][index])
assert train_dataset["input_ids"][index] == ref_input_ids
@pytest.mark.parametrize("num_samples", [4])
def test_supervised_train_on_prompt(num_samples: int):
train_dataset = load_train_dataset(
dataset_dir="REMOTE:" + DEMO_DATA, dataset="system_chat", train_on_prompt=True, **TRAIN_ARGS
)
ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
original_data = load_dataset(DEMO_DATA, name="system_chat", split="train")
indexes = random.choices(range(len(original_data)), k=num_samples)
for index in indexes:
ref_ids = ref_tokenizer.apply_chat_template(original_data["messages"][index])
assert train_dataset["input_ids"][index] == ref_ids
assert train_dataset["labels"][index] == ref_ids
@pytest.mark.parametrize("num_samples", [4])
def test_supervised_mask_history(num_samples: int):
train_dataset = load_train_dataset(
dataset_dir="REMOTE:" + DEMO_DATA, dataset="system_chat", mask_history=True, **TRAIN_ARGS
)
ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
original_data = load_dataset(DEMO_DATA, name="system_chat", split="train")
indexes = random.choices(range(len(original_data)), k=num_samples)
for index in indexes:
messages = original_data["messages"][index]
ref_input_ids = ref_tokenizer.apply_chat_template(messages)
prompt_len = len(ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True))
ref_label_ids = [IGNORE_INDEX] * prompt_len + ref_input_ids[prompt_len:]
assert train_dataset["input_ids"][index] == ref_input_ids
assert train_dataset["labels"][index] == ref_label_ids
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import pytest
from datasets import load_dataset
from transformers import AutoTokenizer
from llamafactory.train.test_utils import load_train_dataset
DEMO_DATA = os.environ.get("DEMO_DATA", "llamafactory/demo_data")
TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
TINY_DATA = os.environ.get("TINY_DATA", "llamafactory/tiny-supervised-dataset")
TRAIN_ARGS = {
"model_name_or_path": TINY_LLAMA,
"stage": "ppo",
"do_train": True,
"finetuning_type": "full",
"reward_model": "",
"reward_model_type": "full",
"dataset": "system_chat",
"dataset_dir": "REMOTE:" + DEMO_DATA,
"template": "llama3",
"cutoff_len": 8192,
"overwrite_cache": True,
"output_dir": "dummy_dir",
"overwrite_output_dir": True,
"fp16": True,
}
@pytest.mark.parametrize("num_samples", [16])
def test_unsupervised_data(num_samples: int):
train_dataset = load_train_dataset(**TRAIN_ARGS)
ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA)
original_data = load_dataset(DEMO_DATA, name="system_chat", split="train")
indexes = random.choices(range(len(original_data)), k=num_samples)
for index in indexes:
messages = original_data["messages"][index]
ref_ids = ref_tokenizer.apply_chat_template(messages)
ref_input_ids = ref_tokenizer.apply_chat_template(messages[:-1], add_generation_prompt=True)
ref_labels = ref_ids[len(ref_input_ids) :]
assert train_dataset["input_ids"][index] == ref_input_ids
assert train_dataset["labels"][index] == ref_labels
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import torch
from llamafactory.data.collator import prepare_4d_attention_mask
def test_4d_attention_mask():
o = 0.0
x = torch.finfo(torch.float16).min
attention_mask_with_indices = torch.tensor(
[
[1, 1, 2, 2, 2, 0],
[1, 2, 2, 3, 3, 3],
]
)
attention_mask_computed = prepare_4d_attention_mask(attention_mask_with_indices, torch.float16)
attention_mask_expected = torch.tensor(
[
[
[
[o, x, x, x, x, x],
[o, o, x, x, x, x],
[x, x, o, x, x, x],
[x, x, o, o, x, x],
[x, x, o, o, o, x],
[x, x, x, x, x, x],
]
],
[
[
[o, x, x, x, x, x],
[x, o, x, x, x, x],
[x, o, o, x, x, x],
[x, x, x, o, x, x],
[x, x, x, o, o, x],
[x, x, x, o, o, o],
]
],
],
dtype=torch.float16,
)
assert list(attention_mask_computed.size()) == [2, 1, 6, 6]
assert torch.all(attention_mask_computed == attention_mask_expected)
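
The expected tensor encodes a block-diagonal causal mask: a query position may attend to a key position only when both carry the same nonzero packing index and the key is not in the future. A sketch reproducing those semantics (an illustration of the mask layout, not the library implementation):

```python
import torch


def block_diagonal_causal_mask(indices: torch.Tensor, dtype: torch.dtype) -> torch.Tensor:
    """indices: (bsz, seq_len) with 0 for padding and k >= 1 marking the k-th packed sequence."""
    _, seq_len = indices.size()
    q = indices[:, None, :, None]  # query-side packing index
    k = indices[:, None, None, :]  # key-side packing index
    causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool, device=indices.device))
    visible = (q == k) & (q != 0) & causal  # same sequence, not padding, no look-ahead
    zero = torch.zeros((), dtype=dtype)
    masked = torch.full((), torch.finfo(dtype).min, dtype=dtype)
    return torch.where(visible, zero, masked)


# Fed the same `attention_mask_with_indices` as above, this reproduces `attention_mask_expected`.
```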
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from llamafactory.data.formatter import EmptyFormatter, FunctionFormatter, StringFormatter, ToolFormatter
def test_empty_formatter():
formatter = EmptyFormatter(slots=["\n"])
assert formatter.apply() == ["\n"]
def test_string_formatter():
formatter = StringFormatter(slots=["<s>", "Human: {{content}}\nAssistant:"])
assert formatter.apply(content="Hi") == ["<s>", "Human: Hi\nAssistant:"]
def test_function_formatter():
formatter = FunctionFormatter(slots=[], tool_format="default")
tool_calls = json.dumps({"name": "tool_name", "arguments": {"foo": "bar", "size": 10}})
assert formatter.apply(content=tool_calls) == [
"""Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n"""
]
def test_multi_function_formatter():
formatter = FunctionFormatter(slots=[], tool_format="default")
tool_calls = json.dumps([{"name": "tool_name", "arguments": {"foo": "bar", "size": 10}}] * 2)
assert formatter.apply(content=tool_calls) == [
"""Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""",
"""Action: tool_name\nAction Input: {\"foo\": \"bar\", \"size\": 10}\n""",
]
def test_default_tool_formatter():
formatter = ToolFormatter(tool_format="default")
tools = [
{
"name": "test_tool",
"description": "tool_desc",
"parameters": {
"type": "object",
"properties": {
"foo": {"type": "string", "description": "foo_desc"},
"bar": {"type": "number", "description": "bar_desc"},
},
"required": ["foo"],
},
}
]
assert formatter.apply(content=json.dumps(tools)) == [
"You have access to the following tools:\n"
"> Tool Name: test_tool\n"
"Tool Description: tool_desc\n"
"Tool Args:\n"
" - foo (string, required): foo_desc\n"
" - bar (number): bar_desc\n\n"
"Use the following format if using a tool:\n"
"```\n"
"Action: tool name (one of [test_tool])\n"
"Action Input: the input to the tool, in a JSON format representing the kwargs "
"""(e.g. ```{"input": "hello world", "num_beams": 5}```)\n"""
"```\n"
]
def test_default_tool_extractor():
formatter = ToolFormatter(tool_format="default")
result = """Action: test_tool\nAction Input: {"foo": "bar", "size": 10}\n"""
assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")]
def test_default_multi_tool_extractor():
formatter = ToolFormatter(tool_format="default")
result = (
"""Action: test_tool\nAction Input: {"foo": "bar", "size": 10}\n"""
"""Action: another_tool\nAction Input: {"foo": "job", "size": 2}\n"""
)
assert formatter.extract(result) == [
("test_tool", """{"foo": "bar", "size": 10}"""),
("another_tool", """{"foo": "job", "size": 2}"""),
]
def test_glm4_tool_formatter():
formatter = ToolFormatter(tool_format="glm4")
tools = [
{
"name": "test_tool",
"description": "tool_desc",
"parameters": {
"type": "object",
"properties": {
"foo": {"type": "string", "description": "foo_desc"},
"bar": {"type": "number", "description": "bar_desc"},
},
"required": ["foo"],
},
}
]
assert formatter.apply(content=json.dumps(tools)) == [
"你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,"
"你的任务是针对用户的问题和要求提供适当的答复和支持。# 可用工具\n\n"
"## test_tool\n\n{}\n在调用上述函数时,请使用 Json 格式表示调用的参数。".format(json.dumps(tools[0], indent=4))
]
def test_glm4_tool_extractor():
formatter = ToolFormatter(tool_format="glm4")
result = """test_tool\n{"foo": "bar", "size": 10}\n"""
assert formatter.extract(result) == [("test_tool", """{"foo": "bar", "size": 10}""")]
# Copyright 2024 the LlamaFactory team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
from typing import TYPE_CHECKING, List, Sequence
import pytest
from transformers import AutoTokenizer
from llamafactory.data import get_template_and_fix_tokenizer
if TYPE_CHECKING:
from transformers import PreTrainedTokenizer
HF_TOKEN = os.environ.get("HF_TOKEN", None)
TINY_LLAMA = os.environ.get("TINY_LLAMA", "llamafactory/tiny-random-Llama-3")
MESSAGES = [
{"role": "user", "content": "How are you"},
{"role": "assistant", "content": "I am fine!"},
{"role": "user", "content": "你好"},
{"role": "assistant", "content": "很高兴认识你!"},
]
def _check_tokenization(
tokenizer: "PreTrainedTokenizer", batch_input_ids: Sequence[Sequence[int]], batch_text: Sequence[str]
) -> None:
for input_ids, text in zip(batch_input_ids, batch_text):
assert input_ids == tokenizer.encode(text, add_special_tokens=False)
assert tokenizer.decode(input_ids) == text
def _check_single_template(
model_id: str, template_name: str, prompt_str: str, answer_str: str, extra_str: str, use_fast: bool
) -> List[str]:
tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=use_fast, token=HF_TOKEN)
content_str = tokenizer.apply_chat_template(MESSAGES, tokenize=False)
content_ids = tokenizer.apply_chat_template(MESSAGES, tokenize=True)
template = get_template_and_fix_tokenizer(tokenizer, name=template_name)
prompt_ids, answer_ids = template.encode_oneturn(tokenizer, MESSAGES)
assert content_str == prompt_str + answer_str + extra_str
assert content_ids == prompt_ids + answer_ids + tokenizer.encode(extra_str, add_special_tokens=False)
_check_tokenization(tokenizer, (prompt_ids, answer_ids), (prompt_str, answer_str))
return content_ids
def _check_template(model_id: str, template_name: str, prompt_str: str, answer_str: str, extra_str: str = "") -> None:
"""
Checks template for both the slow tokenizer and the fast tokenizer.
Args:
model_id: the model id on hugging face hub.
template_name: the template name.
prompt_str: the string corresponding to the prompt part.
answer_str: the string corresponding to the answer part.
extra_str: the extra string in the jinja template of the original tokenizer.
"""
slow_ids = _check_single_template(model_id, template_name, prompt_str, answer_str, extra_str, use_fast=False)
fast_ids = _check_single_template(model_id, template_name, prompt_str, answer_str, extra_str, use_fast=True)
assert slow_ids == fast_ids
@pytest.mark.parametrize("use_fast", [True, False])
def test_encode_oneturn(use_fast: bool):
tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA, use_fast=use_fast)
template = get_template_and_fix_tokenizer(tokenizer, name="llama3")
prompt_ids, answer_ids = template.encode_oneturn(tokenizer, MESSAGES)
prompt_str = (
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\nI am fine!<|eot_id|>"
"<|start_header_id|>user<|end_header_id|>\n\n你好<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\n"
)
answer_str = "很高兴认识你!<|eot_id|>"
_check_tokenization(tokenizer, (prompt_ids, answer_ids), (prompt_str, answer_str))
@pytest.mark.parametrize("use_fast", [True, False])
def test_encode_multiturn(use_fast: bool):
tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA, use_fast=use_fast)
template = get_template_and_fix_tokenizer(tokenizer, name="llama3")
encoded_pairs = template.encode_multiturn(tokenizer, MESSAGES)
prompt_str_1 = (
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\n"
)
answer_str_1 = "I am fine!<|eot_id|>"
prompt_str_2 = (
"<|start_header_id|>user<|end_header_id|>\n\n你好<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\n"
)
answer_str_2 = "很高兴认识你!<|eot_id|>"
_check_tokenization(
tokenizer,
(encoded_pairs[0][0], encoded_pairs[0][1], encoded_pairs[1][0], encoded_pairs[1][1]),
(prompt_str_1, answer_str_1, prompt_str_2, answer_str_2),
)
@pytest.mark.parametrize("use_fast", [True, False])
def test_jinja_template(use_fast: bool):
tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA, use_fast=use_fast)
ref_tokenizer = AutoTokenizer.from_pretrained(TINY_LLAMA, use_fast=use_fast)
get_template_and_fix_tokenizer(tokenizer, name="llama3")
assert tokenizer.chat_template != ref_tokenizer.chat_template
assert tokenizer.apply_chat_template(MESSAGES) == ref_tokenizer.apply_chat_template(MESSAGES)
@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
def test_gemma_template():
prompt_str = (
"<bos><start_of_turn>user\nHow are you<end_of_turn>\n"
"<start_of_turn>model\nI am fine!<end_of_turn>\n"
"<start_of_turn>user\n你好<end_of_turn>\n"
"<start_of_turn>model\n"
)
answer_str = "很高兴认识你!"
_check_template("google/gemma-2-9b-it", "gemma", prompt_str, answer_str, extra_str="<end_of_turn>\n")
@pytest.mark.skipif(not HF_TOKEN, reason="Gated model.")
def test_llama3_template():
prompt_str = (
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\nI am fine!<|eot_id|>"
"<|start_header_id|>user<|end_header_id|>\n\n你好<|eot_id|>"
"<|start_header_id|>assistant<|end_header_id|>\n\n"
)
answer_str = "很高兴认识你!<|eot_id|>"
_check_template("meta-llama/Meta-Llama-3-8B-Instruct", "llama3", prompt_str, answer_str)
def test_qwen_template():
prompt_str = (
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
"<|im_start|>user\nHow are you<|im_end|>\n"
"<|im_start|>assistant\nI am fine!<|im_end|>\n"
"<|im_start|>user\n你好<|im_end|>\n"
"<|im_start|>assistant\n"
)
answer_str = "很高兴认识你!<|im_end|>"
_check_template("Qwen/Qwen2-7B-Instruct", "qwen", prompt_str, answer_str, extra_str="\n")
@pytest.mark.skip(reason="The fast tokenizer of Yi model is corrupted.")
def test_yi_template():
prompt_str = (
"<|im_start|>user\nHow are you<|im_end|>\n"
"<|im_start|>assistant\nI am fine!<|im_end|>\n"
"<|im_start|>user\n你好<|im_end|>\n"
"<|im_start|>assistant\n"
)
answer_str = "很高兴认识你!<|im_end|>"
_check_template("01-ai/Yi-1.5-6B-Chat", "yi", prompt_str, answer_str)