Commit b78d979f by nzy

utils_dataset: save dataset to llamafactory's data directory

parent 4b69d51d
@@ -23,14 +23,11 @@ minimal_test_path = ""
 [preference_dataset.max_edit_distance]
 dataset_name = ""
 metadata_path = ""
-preference_dataset_path = ""
-dataset_info_path = ""
 [preference_dataset.min_edit_distance]
 dataset_name = ""
 metadata_path = ""
-preference_dataset_path = ""
-dataset_info_path = ""
 [orm.max_edit_distance]
 model_path = ""
...
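The per-dataset `preference_dataset_path` / `dataset_info_path` keys go away because outputs are now derived from a single LLaMA-Factory checkout. The scripts below read `cfg["llamafactory_path"]`, so the config presumably gains one top-level key along these lines (a sketch; the key's exact placement is not part of this diff):

```toml
# Assumed addition: root of the LLaMA-Factory checkout.
# save_dataset writes everything under <llamafactory_path>/data/.
llamafactory_path = ""
```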
 from utils import load_jsonl, save_json, extract_code, read_config
-from utils_preference_dataset import mk_preference_dataset_info, mk_preference_pair
+from utils_dataset import mk_preference_dataset_info, mk_preference_pair, save_dataset
 from nltk.metrics.distance import edit_distance
 from collections import defaultdict
 from itertools import product, chain
@@ -103,14 +103,7 @@ if __name__ == "__main__":
     dataset_info = mk_preference_dataset_info(max_dataset_cfg["dataset_name"])
     save_json(metadata, max_dataset_cfg["metadata_path"])
-    save_json(
-        preference_pairs,
-        max_dataset_cfg["preference_dataset_path"],
-    )
-    save_json(
-        dataset_info,
-        max_dataset_cfg["dataset_info_path"],
-    )
+    save_dataset(cfg["llamafactory_path"], dataset_info, preference_pairs)
     # Minimum distance
     preference_pairs, metadata = mk_edit_distance_dataset(
@@ -119,11 +112,4 @@ if __name__ == "__main__":
     min_dataset_cfg = cfg["preference_dataset"]["min_edit_distance"]
     dataset_info = mk_preference_dataset_info(min_dataset_cfg["dataset_name"])
     save_json(metadata, min_dataset_cfg["metadata_path"])
-    save_json(
-        preference_pairs,
-        min_dataset_cfg["preference_dataset_path"],
-    )
-    save_json(
-        dataset_info,
-        min_dataset_cfg["dataset_info_path"],
-    )
+    save_dataset(cfg["llamafactory_path"], dataset_info, preference_pairs)
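For reference, `mk_preference_dataset_info` returns a one-entry mapping in the format of LLaMA-Factory's `dataset_info.json`; `save_dataset` merges it into the global file and reads back `file_name` to decide where the pairs are written. A sketch of the expected shape, with only `file_name` confirmed by this diff and the `ranking` flag assumed from LLaMA-Factory's convention for preference data:

```python
# Hypothetical dataset_info entry for a preference dataset; the name and
# field values here are illustrative only.
dataset_info = {
    "my_preference_dataset": {
        "file_name": "my_preference_dataset.json",  # relative to data/
        "ranking": True,  # assumed: marks a chosen/rejected pair dataset
    }
}
```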
@@ -5,7 +5,7 @@
 # This experiment aims to fairly compare these two approaches.
 from utils import load_json, save_json, read_config
-from utils_preference_dataset import mk_critic_qa, mk_critic_verify, mk_sft_item, mk_sft_dataset_info
+from utils_dataset import mk_critic_qa, mk_critic_verify, mk_sft_item, mk_sft_dataset_info, save_dataset
 def convert_preference_to_sft(item):
@@ -29,5 +29,4 @@ if __name__ == "__main__":
         sft_dataset.extend(convert_preference_to_sft(item))
     dataset_info = mk_sft_dataset_info(cfg["sftorm"]["dataset_name"])
-    save_json(sft_dataset, cfg["sftorm"]["dataset_path"])
-    save_json(dataset_info, cfg["sftorm"]["dataset_info_path"])
+    save_dataset(cfg["llamafactory_path"], dataset_info, sft_dataset)
@@ -8,7 +8,7 @@ from copy import deepcopy
 from utils import load_jsonl, save_jsonl, extract_code, read_config
 from utils_metric import group_results, score_pass_at_k
-from utils_preference_dataset import code_template
+from utils_dataset import code_template
 from transformers import AutoTokenizer
...
 from utils_vllm import vllm_score
 from utils import read_config, load_jsonl, save_jsonl, extract_code
-from utils_preference_dataset import code_template, mk_critic_qa, mk_critic_verify
+from utils_dataset import code_template, mk_critic_qa, mk_critic_verify
 from utils_metric import group_results, score_pass_at_k
 from transformers import AutoTokenizer
...
@@ -27,9 +27,9 @@ def save_jsonl(data, file_path):
         for item in data:
             f.write(json.dumps(item) + "\n")
-def save_json(data, file_path):
+def save_json(data, file_path, indent=None):
     with open(file_path, "w", encoding="utf-8") as f:
-        json.dump(data, f)
+        json.dump(data, f, indent=indent)
 codeblock_pattern = re.compile(r"```python(.+?)```", flags=re.DOTALL)
...
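The `indent` parameter defaults to `None`, so existing call sites keep writing compact JSON, while `save_dataset` below passes `indent=4` so the shared `dataset_info.json` stays human-readable. A quick illustration of the difference:

```python
import json

data = {"demo": {"file_name": "demo.json"}}
print(json.dumps(data))            # compact: {"demo": {"file_name": "demo.json"}}
print(json.dumps(data, indent=4))  # same content spread over indented lines,
                                   # as save_json(..., indent=4) now writes it
```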
+from utils import load_json, save_json
 def mk_preference_dataset_info(dataset_name):
     return {
         dataset_name: {
@@ -21,6 +24,7 @@ def mk_preference_dataset_info(dataset_name):
 # see utils.extract_code
+# TODO Check the code format in dataset
 code_template = r"```python{}```"
@@ -76,3 +80,15 @@ def mk_critic_verify(answer=None):
         message.append({"role": "assistant", "content": response})
     return message
+def save_dataset(llamafactory_path, dataset_info, dataset):
+    # Register the new dataset in LLaMA-Factory's global registry,
+    # pretty-printed so data/dataset_info.json stays reviewable.
+    all_dataset_info_path = f"{llamafactory_path}/data/dataset_info.json"
+    all_dataset_info = load_json(all_dataset_info_path)
+    all_dataset_info |= dataset_info
+    save_json(all_dataset_info, all_dataset_info_path, indent=4)
+
+    # dataset_info carries exactly one entry; dict_keys does not support
+    # indexing, so take the single name with next(iter(...)).
+    assert len(dataset_info) == 1
+    dataset_name = next(iter(dataset_info))
+    dataset_relative_path = dataset_info[dataset_name]["file_name"]
+    save_json(dataset, f"{llamafactory_path}/data/{dataset_relative_path}")
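Taken together, callers now pass the checkout path instead of two output paths. A minimal usage sketch mirroring the scripts above (the config file name, dataset name, and pair list are placeholders):

```python
from utils import read_config
from utils_dataset import mk_preference_dataset_info, save_dataset

cfg = read_config("config.toml")  # assumed config file name
dataset_info = mk_preference_dataset_info("my_preference_dataset")
preference_pairs = []  # pairs built by mk_preference_pair in the real scripts

# Merges the entry into <llamafactory_path>/data/dataset_info.json and writes
# the pairs to <llamafactory_path>/data/<file_name>.
save_dataset(cfg["llamafactory_path"], dataset_info, preference_pairs)
```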