import argparse
from pathlib import Path
from codecritic.utils.json import load_json
from codecritic.data.utils import save_jsonl_dataset

from codecritic.data.edit_distance import (
    mk_problem_groups,
    calculate_edit_distances,
    mk_edit_distance_dataset,
)


def parse_bool(value):
    """Convert a command-line string into a real boolean.

    ``argparse`` with ``type=bool`` just calls ``bool()`` on the raw string,
    so every non-empty value (including ``"False"``) becomes ``True``. This
    converter interprets the text instead.

    Args:
        value: Raw string taken from the command line.

    Returns:
        ``True`` or ``False`` according to the spelled-out value.

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised boolean.
    """
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string stays falsy: with the old ``type=bool`` it was the only
    # way to request the "min" variant, so existing invocations keep working.
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(
        f"expected a boolean value (true/false), got {value!r}"
    )


def build_parser():
    """Create the argument parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` accepting ``--dataset_dir``,
        ``--output_dir`` and ``--is_max`` (all required).
    """
    parser = argparse.ArgumentParser()
    # Both paths are needed further down; requiring them yields a usage error
    # instead of a TypeError when one is omitted.
    parser.add_argument("--dataset_dir", type=str, required=True)
    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument("--is_max", type=parse_bool, required=True)
    return parser


if __name__ == "__main__":
    args = build_parser().parse_args()

    # The dataset directory must contain "train.jsonl" and
    # "sampling_params.json"; the latter must provide the key "n".
    dataset_dir = Path(args.dataset_dir)
    train_path = dataset_dir / "train.jsonl"
    sampling_params = load_json(dataset_dir / "sampling_params.json")
    problems = mk_problem_groups(train_path, sampling_params["n"])

    all_edit_distance_pairs = calculate_edit_distances(problems)

    # NOTE(review): the meaning of the positional constants 10 * 1000 and 5 is
    # defined by mk_edit_distance_dataset -- confirm against its signature.
    # The returned metadata is not used by this script.
    preference_pairs, _metadata = mk_edit_distance_dataset(
        all_edit_distance_pairs, 10 * 1000, 5, is_max=args.is_max
    )

    save_jsonl_dataset(preference_pairs, args.output_dir)
