Commit 78b884ce by nzy

refactor: edit_distance

TODO: move the functions from codecritic.utils.data into codecritic.data
parent afcf4289
import argparse
from pathlib import Path
from codecritic.utils.json import load_json
from codecritic.utils.data import save_jsonl_dataset
from codecritic.data.edit_distance import (
mk_problem_groups,
calculate_edit_distances,
mk_edit_distance_dataset,
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_dir", type=str)
parser.add_argument("--output_dir", type=str)
parser.add_argument("--is_max", type=bool, required=True)
args = parser.parse_args()
dataset_dir = Path(args.dataset_dir)
train_path = dataset_dir / "train.jsonl"
sampling_params = load_json(dataset_dir / "sampling_params.json")
problems = mk_problem_groups(train_path, sampling_params["n"])
all_edit_distance_pairs = calculate_edit_distances(problems)
postfix = "max" if args.is_max else "min"
dataset_name = f"apps_edit_distance_{postfix}"
preference_pairs, metadata = mk_edit_distance_dataset(
all_edit_distance_pairs, 10 * 1000, 5, is_max=args.is_max
)
save_jsonl_dataset(preference_pairs, args.output_dir)
import argparse
from pathlib import Path
from codecritic.utils.json import load_json, load_jsonl
from codecritic.utils.data import extract_code, mk_preference_pair, save_jsonl_dataset
from codecritic.utils.json import load_jsonl
from codecritic.utils.data import extract_code
from nltk.metrics.distance import edit_distance
from collections import defaultdict
from itertools import product, chain
import multiprocessing
from tqdm.contrib.concurrent import process_map
def mk_preference_pair(instruction, chosen_code, rejected_code):
    """Build one preference-learning record from an instruction and a code pair.

    The prompt becomes a single user message; the chosen/rejected codes are
    each rendered through the module-level ``code_template`` and wrapped as
    assistant messages.
    """
    chosen_msg = {"role": "assistant", "content": code_template.format(chosen_code)}
    rejected_msg = {"role": "assistant", "content": code_template.format(rejected_code)}
    return {
        "messages": [{"role": "user", "content": instruction}],
        "chosen": chosen_msg,
        "rejected": rejected_msg,
    }
def mk_problem_groups(train_dataset_path, n):
train_dataset = load_jsonl(train_dataset_path)
......@@ -86,27 +96,4 @@ def mk_edit_distance_dataset(all_pairs, k, n, is_max=True):
preference_pairs.append(mk_preference_pair(instr, pair[0], pair[1]))
pairs_metadata.append(dict(problem_id=problem_id, edit_distance=distance))
return preference_pairs, pairs_metadata
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_dir", type=str)
parser.add_argument("--output_dir", type=str)
parser.add_argument("--is_max", type=bool, required=True)
args = parser.parse_args()
dataset_dir = Path(args.dataset_dir)
train_path = dataset_dir / "train.jsonl"
sampling_params = load_json(dataset_dir / "sampling_params.json")
problems = mk_problem_groups(train_path, sampling_params["n"])
all_edit_distance_pairs = calculate_edit_distances(problems)
postfix = "max" if args.is_max else "min"
dataset_name = f"apps_edit_distance_{postfix}"
preference_pairs, metadata = mk_edit_distance_dataset(
all_edit_distance_pairs, 10 * 1000, 5, is_max=args.is_max
)
save_jsonl_dataset(preference_pairs, args.output_dir)
return preference_pairs, pairs_metadata
\ No newline at end of file
......@@ -19,19 +19,6 @@ def extract_code(text: str):
return ""
def mk_preference_pair(instruction, chosen_code, rejected_code):
    """Assemble a chosen/rejected preference record for a single instruction.

    Both code strings are formatted via the module-level ``code_template``
    before being wrapped as assistant-role messages.
    """
    def _assistant(code):
        # Render code through the shared template as an assistant turn.
        return {"role": "assistant", "content": code_template.format(code)}

    return {
        "messages": [{"role": "user", "content": instruction}],
        "chosen": _assistant(chosen_code),
        "rejected": _assistant(rejected_code),
    }
# Note that the human and observation should appear in odd positions
# while llm should appear in even positions.
def mk_messages(messages):
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment