Commit 191a00d3 by nzy

utils: remove read_config

parent acf44fdb
model = "/path/to/model"
apps = "/path/to/apps_dataset"
[sample]
sample_prompt_path = "path"
sample_result_path = "path"
[sample.sampling_params]
n = 0
temperature = 0.6
max_new_tokens = 2048
[evaluate]
evaluate_result_path = ""
[dataset]
train_path = ""
test_path = ""
minimal_test_path = ""
\ No newline at end of file
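For reference, the deleted TOML parses into exactly the nested dict that the old cfg[...] lookups in the scripts below index into. A minimal sketch using the standard-library tomllib (Python 3.11+, the same parser the removed read_config() used), with a trimmed copy of the config:

import tomllib

# A trimmed copy of the deleted config, parsed the way read_config() did.
cfg = tomllib.loads('''
model = "/path/to/model"

[sample]
sample_prompt_path = "path"

[sample.sampling_params]
n = 0
temperature = 0.6
''')

assert cfg["model"] == "/path/to/model"
assert cfg["sample"]["sampling_params"]["n"] == 0  # dotted tables nest as dicts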
-from utils import read_config
+import argparse
+from pathlib import Path
+from utils import save_json
 from utils_vllm import vllm_chatcomplete
 from step1_sample_apps import mk_sample_prompt
 from step1_evaluate_code import evaluate
 from step1_sort_split_dataset import sort_and_split_dataset
 
 if __name__ == "__main__":
-    cfg = read_config()
-    mk_sample_prompt(cfg["model"], cfg["apps"], cfg["sample"]["sample_prompt_path"])
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", type=str)
+    parser.add_argument("--apps", type=str)
+    parser.add_argument("--output_dir", type=str)
+    args = parser.parse_args()
-    vllm_chatcomplete(
-        cfg["model"],
-        cfg["sample"]["sample_prompt_path"],
-        cfg["sample"]["sample_result_path"],
-        cfg["sample"]["sampling_params"],
-    )
+    output_dir = Path(args.output_dir)
+    output_dir.mkdir(exist_ok=True)
-    evaluate(
-        cfg["sample"]["sample_result_path"],
-        cfg["apps"],
-        cfg["evaluate"]["evaluate_result_path"],
-    )
+    prompt_path = output_dir / "prompt.jsonl"
+    mk_sample_prompt(args.model, args.apps, prompt_path)
+    code_path = output_dir / "sample.jsonl"
+    sampling_params = dict(n=50, temperature=0.6, max_new_tokens=2048)
+    save_json(sampling_params, output_dir / "sampling_params.json")
+    vllm_chatcomplete(args.model, prompt_path, code_path, sampling_params)
+    dataset_path = output_dir / "dataset.jsonl"
+    evaluate(code_path, args.apps, dataset_path)
+    train_path = output_dir / "train.jsonl"
+    test_path = output_dir / "test.jsonl"
+    min_test_path = output_dir / "min_test.jsonl"
     sort_and_split_dataset(
-        cfg["evaluate"]["evaluate_result_path"],
-        cfg["dataset"]["train_path"],
-        cfg["dataset"]["test_path"],
-        cfg["dataset"]["minimal_test_path"],
-        cfg["sample"]["sampling_params"]["n"]
-    )
\ No newline at end of file
+        dataset_path, train_path, test_path, min_test_path, sampling_params["n"]
+    )
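The refactor's key design change is visible here: instead of a shared TOML, the step-1 driver persists sampling_params next to its outputs, and the step-2 script reloads it from the same directory. A sketch of that hand-off, with hypothetical minimal save_json/load_json bodies matching only the signatures the call sites imply (the real utils versions are not shown in this diff):

import json
from pathlib import Path

# Hypothetical minimal helpers with the call-site signatures
# save_json(obj, path) and load_json(path).
def save_json(obj, path):
    Path(path).write_text(json.dumps(obj, indent=2))

def load_json(path):
    return json.loads(Path(path).read_text())

out = Path("out")        # stands in for --output_dir
out.mkdir(exist_ok=True)
save_json(dict(n=50, temperature=0.6, max_new_tokens=2048), out / "sampling_params.json")
# The downstream script can now recover n without any global config file.
assert load_json(out / "sampling_params.json")["n"] == 50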
-from utils import load_jsonl, save_json, extract_code, read_config
+import argparse
+from pathlib import Path
+from utils import load_json, load_jsonl, save_json, save_jsonl, extract_code
 from utils_dataset import mk_preference_dataset_info, mk_preference_pair, save_dataset
 from nltk.metrics.distance import edit_distance
 from collections import defaultdict
@@ -88,28 +90,25 @@ def mk_edit_distance_dataset(all_pairs, k, n, is_max=True):
 if __name__ == "__main__":
-    cfg = read_config()
-    problems = mk_problem_groups(
-        cfg["dataset"]["train_path"], cfg["sample"]["sampling_params"]["n"]
-    )
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dataset_dir", type=str)
+    parser.add_argument("--llamafactory", type=str)
+    parser.add_argument("--is_max", type=bool, required=True)
+    args = parser.parse_args()
+    dataset_dir = Path(args.dataset_dir)
+    train_path = dataset_dir / "train.jsonl"
+    sampling_params = load_json(dataset_dir / "sampling_params.json")
+    problems = mk_problem_groups(train_path, sampling_params["n"])
     all_edit_distance_pairs = calculate_edit_distances(problems)
-    # Maximum distance
+    postfix = "max" if args.is_max else "min"
+    dataset_name = f"apps_edit_distance_{postfix}"
     preference_pairs, metadata = mk_edit_distance_dataset(
-        all_edit_distance_pairs, 10 * 1000, 5, is_max=True
+        all_edit_distance_pairs, 10 * 1000, 5, is_max=args.is_max
     )
-    max_dataset_cfg = cfg["preference_dataset"]["max_edit_distance"]
-    dataset_info = mk_preference_dataset_info(max_dataset_cfg["dataset_name"])
-    save_json(metadata, max_dataset_cfg["metadata_path"])
-    save_dataset(cfg["llamafactory_path"], dataset_info, preference_pairs)
-    # Minimum distance
-    preference_pairs, metadata = mk_edit_distance_dataset(
-        all_edit_distance_pairs, 10 * 1000, 5, is_max=False
-    )
-    min_dataset_cfg = cfg["preference_dataset"]["min_edit_distance"]
-    dataset_info = mk_preference_dataset_info(min_dataset_cfg["dataset_name"])
-    save_json(metadata, min_dataset_cfg["metadata_path"])
-    save_dataset(cfg["llamafactory_path"], dataset_info, preference_pairs)
+    dataset_info = mk_preference_dataset_info(dataset_name)
+    save_json(metadata, dataset_dir / f"{dataset_name}_metadata.json")
+    save_dataset(args.llamafactory, dataset_info, preference_pairs)
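One caveat in the new CLI: argparse applies type directly to the raw string, and bool("False") is True, so --is_max False silently selects the max-distance dataset. A hedged fix (not part of this commit) is an explicit string-to-bool parser:

import argparse

def str2bool(s: str) -> bool:
    # bool() maps any non-empty string to True, so parse explicitly.
    if s.lower() in ("1", "true", "yes"):
        return True
    if s.lower() in ("0", "false", "no"):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean, got {s!r}")

parser = argparse.ArgumentParser()
parser.add_argument("--is_max", type=str2bool, required=True)
print(parser.parse_args(["--is_max", "False"]).is_max)  # False, as intended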
@@ -47,26 +47,3 @@ def extract_code(text: str):
 def code_similarity(ref, pred):
     return calc_codebleu([ref], [pred], lang="python", weights=(0, 0.5, 0.5, 0))
-def read_config(arg_lst=None):
-    import argparse
-    argparser = argparse.ArgumentParser()
-    argparser.add_argument("--config", type=str)
-    if arg_lst:
-        for arg in arg_lst:
-            argparser.add_argument(f"--{arg}", type=str)
-    args = argparser.parse_args()
-    with open(args.config, "rb") as f:
-        cfg = tomllib.load(f)
-    args_dict = vars(args)
-    for arg, val in args_dict.items():
-        assert arg not in cfg.keys()
-        cfg[arg] = val
-    return cfg
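Removing this helper also removes a footgun: it merged every CLI flag (including unset ones, whose value is None) into the TOML dict and asserted on key collisions, so any flag named after a top-level config key crashed the script. A small hypothetical repro of that failure mode:

cfg = {"model": "/path/to/model"}                  # as parsed from the TOML
args_dict = {"config": "cfg.toml", "model": None}  # vars(args): --model declared, never passed
for arg, val in args_dict.items():
    assert arg not in cfg.keys(), f"CLI flag {arg!r} collides with a config key"
    cfg[arg] = val  # "config" merges fine; "model" trips the assert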