Commit 04e7f84d by nzy

utils: finish TODO: refactor vllm_chatcomplete, vllm_score, and other functions in step1; don't save results inside the functions, return them and let the caller save.

parent 0c079f4d
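The thrust of the change: the step functions no longer load their inputs from disk or write their outputs; they take and return in-memory lists, and the calling script decides what to persist. A minimal sketch of the new convention, assembled from the hunks below (paths and variable names illustrative):

    from pathlib import Path
    from utils import save_jsonl
    from utils_vllm import vllm_chatcomplete
    from step1_sample_apps import mk_sample_prompt
    from step1_evaluate_code import evaluate

    output_dir = Path("out")
    prompts = mk_sample_prompt("path/to/model", "path/to/apps")   # returns a list of prompt dicts
    codes = vllm_chatcomplete("path/to/model", prompts, dict(n=50, temperature=0.6, max_new_tokens=2048))
    results = evaluate(codes, "path/to/apps")                     # returns evaluated samples
    save_jsonl(results, output_dir / "dataset.jsonl")             # persistence happens at the call site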
@@ -137,8 +137,7 @@ def evaluate_incorrect_code_samples_again(results, apps, loop_num):
     return correct_lst

-def evaluate(code_sample_path, dataset_path, output_path):
-    code_samples = load_jsonl(code_sample_path)
+def evaluate(code_samples, dataset_path):
     apps = load_dataset(dataset_path)
     results = evaluate_code_samples(code_samples, apps)
     for item in results:
@@ -146,4 +145,4 @@ def evaluate(code_sample_path, dataset_path, output_path):
         item["eval_result"] = bool(np.all(np.asarray(item["testcase"]) > 0))
     results = evaluate_incorrect_code_samples_again(results, apps, 10)
-    save_jsonl(results, output_path)
+    return results
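Because evaluate now returns the evaluated samples instead of writing them, a caller can inspect them in memory before deciding what to save. A hedged sketch (the pass-rate computation is illustrative, not part of the commit):

    results = evaluate(code_samples, apps_path)
    pass_rate = sum(item["eval_result"] for item in results) / len(results)   # eval_result is a bool per sample
    print(f"sample-level pass rate: {pass_rate:.2%}")
    save_jsonl(results, output_dir / "dataset.jsonl")                         # saving is now the caller's job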
 import argparse
 from pathlib import Path
-from utils import save_json
+from utils import save_json, save_jsonl
 from utils_vllm import vllm_chatcomplete
 from step1_sample_apps import mk_sample_prompt
 from step1_evaluate_code import evaluate
@@ -17,20 +17,17 @@ if __name__ == "__main__":
     output_dir = Path(args.output_dir)
     output_dir.mkdir(exist_ok=True)

-    prompt_path = output_dir / "prompt.jsonl"
-    mk_sample_prompt(args.model, args.apps, prompt_path)
-
-    code_path = output_dir / "sample.jsonl"
+    prompts = mk_sample_prompt(args.model, args.apps)
     sampling_params = dict(n=50, temperature=0.6, max_new_tokens=2048)
     save_json(sampling_params, output_dir / "sampling_params.json")
-    vllm_chatcomplete(args.model, prompt_path, code_path, sampling_params)
+    codes = vllm_chatcomplete(args.model, prompts, sampling_params)
+    save_jsonl(codes, output_dir / "sample.jsonl")

-    dataset_path = output_dir / "dataset.jsonl"
-    evaluate(code_path, args.apps, dataset_path)
+    dataset = evaluate(codes, args.apps)
+    save_jsonl(dataset, output_dir / "dataset.jsonl")

-    train_path = output_dir / "train.jsonl"
-    test_path = output_dir / "test.jsonl"
-    min_test_path = output_dir / "min_test.jsonl"
-    sort_and_split_dataset(
-        dataset_path, train_path, test_path, min_test_path, sampling_params["n"]
-    )
+    train, test, min_test = sort_and_split_dataset(dataset, sampling_params["n"])
+    save_jsonl(train, output_dir / "train.jsonl")
+    save_jsonl(test, output_dir / "test.jsonl")
+    save_jsonl(min_test, output_dir / "min_test.jsonl")
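Since every stage in this driver now exchanges plain lists, the same logic can be exercised on a few problems without creating intermediate files. A hypothetical smoke test (the slice and n=1 are illustrative, not in the commit):

    prompts = mk_sample_prompt(args.model, args.apps)[:5]            # only a handful of problems
    smoke_params = dict(n=1, temperature=0.6, max_new_tokens=2048)   # one sample per prompt
    codes = vllm_chatcomplete(args.model, prompts, smoke_params)
    dataset = evaluate(codes, args.apps)
    print(sum(item["eval_result"] for item in dataset), "of", len(dataset), "samples passed")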
@@ -28,7 +28,7 @@ def mk_prompt(doc) -> str:
     return conversation

-def mk_sample_prompt(model_path, apps_path, output_path):
+def mk_sample_prompt(model_path, apps_path):
     tokenizer = AutoTokenizer.from_pretrained(model_path)
     tokenizer.pad_token = tokenizer.eos_token
@@ -58,5 +58,4 @@ def mk_sample_prompt(model_path, apps_path, output_path):
         prompts.append(dict(problem_id=problem_id, messages=prompt))

-    print(f"size of dataset: {len(prompts)}")
-    save_jsonl(prompts, output_path)
+    return prompts
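mk_sample_prompt now returns the prompt list; each entry is a dict with problem_id and messages, and prompt.jsonl is no longer written for you. A small sketch of the new call site (paths illustrative):

    prompts = mk_sample_prompt("path/to/model", "path/to/apps")
    print(f"size of dataset: {len(prompts)}")             # the removed print can live at the call site if still wanted
    save_jsonl(prompts, Path("out") / "prompt.jsonl")     # optional; the step1 driver no longer saves prompts at all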
@@ -24,14 +24,13 @@ def is_in_test_range(prefix_weight, idx):
     return False

-def sort_and_split_dataset(raw_dataset_path, new_train_path, new_test_path, minimal_test_path, n):
+def sort_and_split_dataset(dataset, n):
     """
     The dataset will be divided into two parts: Train and Test.
     From the Test set, 10% of items across varying difficulties will be selected.
     Among these, only those items for which the LLM can generate correct solutions will be included in the minimal testset.
     This approach reduces the test time by approximately 1/5.
     """
-    dataset = load_jsonl(raw_dataset_path)
     # add `key_for_sort`
     new_train, new_test = [], []
@@ -53,6 +52,6 @@ def sort_and_split_dataset(raw_dataset_path, new_train_path, new_test_path, minimal_test_path, n):
         if has_correct_solution:
             minimal_test.extend(problem)

-    save_jsonl(new_train, new_train_path)
-    save_jsonl(new_test, new_test_path)
-    save_jsonl(minimal_test, minimal_test_path)
+    return new_train, new_test, minimal_test
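sort_and_split_dataset now takes the in-memory dataset and returns the three splits, so the caller persists whichever it needs; the step1 driver above does exactly this. A brief sketch:

    train, test, min_test = sort_and_split_dataset(dataset, sampling_params["n"])
    print(f"train={len(train)}  test={len(test)}  minimal_test={len(min_test)}")
    save_jsonl(min_test, output_dir / "min_test.jsonl")   # e.g. keep only the minimal test set on disk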
@@ -40,17 +40,13 @@ if __name__ == "__main__":
     parser.add_argument("--preference_dataset", type=str)
     parser.add_argument("--llamafactory", type=str)
     parser.add_argument("--dataset_name", type=str)
-    parser.add_argument("--output_dir", type=str)
     args = parser.parse_args()

-    output_dir = Path(args.output_dir)
     preference_dataset = load_json(args.preference_dataset)
     cov_prompts = list(chain(*convert_preference_to_vot_prompt(preference_dataset)))

     sampling_params = dict(n=1, temperature=0.8, max_tokens=2048)
-    reason_path = output_dir / "cov.jsonl"
-    covs = vllm_chatcomplete(args.model, cov_prompts, reason_path, sampling_params)
+    covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params)

     dataset = list(map(convert_cov_to_cov_dataset, covs))
     dataset_info = mk_sft_dataset_info(args.dataset_name)
...
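With --output_dir gone, the CoV generations stay in memory and feed straight into convert_cov_to_cov_dataset. If the raw generations are still wanted on disk, a caller-side save would look like this (hypothetical, not part of this commit; cov_dir is a made-up path):

    covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params)
    # Hypothetical: persist the raw CoV outputs, as the old reason_path used to.
    # save_jsonl(covs, Path(cov_dir) / "cov.jsonl")
    dataset = list(map(convert_cov_to_cov_dataset, covs))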
@@ -35,16 +35,14 @@ def run_sft_model(model_path, test_path, apps_path, reason_prompt=None):
     if reason_prompt:
         test_dataset = [append_prompt(x, COV_PROMPT) for x in test_dataset]
         sampling_params = dict(n=1, temperature=0.0, max_tokens=2048)
-        reason_path = result_dir / "reason.jsonl"
-        test_dataset = vllm_chatcomplete(
-            model_path, test_dataset, reason_path, sampling_params
-        )
+        test_dataset = vllm_chatcomplete(model_path, test_dataset, sampling_params)

     # score
-    score_path = result_dir / "scores.jsonl"
     score_token = get_score_token_id(model_path)
     test_dataset = [append_prompt(x, JUDGE_PROMPT) for x in test_dataset]
-    results = vllm_score(model_path, test_dataset, score_path, score_token)
+    results = vllm_score(model_path, test_dataset, score_token)
+    score_path = result_dir / "scores.jsonl"
+    save_jsonl(results, score_path)

     # compute pass@k
     eval_result_path = result_dir / "passk.jsonl"
...
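Note that reason.jsonl is no longer written here; only scores.jsonl is, and explicitly at the call site. If the intermediate reasoning should still be kept, the caller would add its own save (hypothetical, not in this commit):

    test_dataset = vllm_chatcomplete(model_path, test_dataset, sampling_params)
    # Hypothetical: restore the old behaviour of persisting the reasoning pass.
    # save_jsonl(test_dataset, result_dir / "reason.jsonl")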
@@ -78,12 +78,7 @@ def score_worker(cuda_device, prompts, model_path, score_token):
     return result

-def vllm_chatcomplete(model_path, prompts, output_path, sampling_params):
-    if isinstance(prompts, str):
-        prompts = load_jsonl(prompts)
-    else:
-        assert isinstance(prompts, list)
+def vllm_chatcomplete(model_path, prompts, sampling_params):
     # Respect the slurm's gpu allocation
     cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
     gpu_num = len(cuda_devices)
@@ -100,14 +95,10 @@ def vllm_chatcomplete(model_path, prompts, output_path, sampling_params):
     nested_results = pool.starmap(worker_llm, args)
     results = list(chain(*nested_results))
-    print(f"size of dataset: {len(results)}")
-    save_jsonl(results, output_path)
     return results

-def vllm_score(model_path, prompt_path, output_path, score_token):
-    prompts = load_jsonl(prompt_path)
+def vllm_score(model_path, prompts, score_token):
     # Respect the slurm's gpu allocation
     cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
     gpu_num = len(cuda_devices)
@@ -124,6 +115,4 @@ def vllm_score(model_path, prompt_path, output_path, score_token):
     nested_results = pool.starmap(worker_llm, args)
     results = list(chain(*nested_results))
-    print(f"size of dataset: {len(results)}")
-    save_jsonl(results, output_path)
     return results
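Both helpers now take a list of prompts and return a list of results; the old path-loading branch and the save_jsonl/print calls are gone, so loading and saving are entirely the caller's responsibility. A hedged usage sketch (model path, prompt contents, and the score token are illustrative; CUDA_VISIBLE_DEVICES must be set because both functions read it to split work across GPUs):

    import os
    from utils import save_jsonl
    from utils_vllm import vllm_chatcomplete, vllm_score

    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

    prompts = [dict(problem_id=0, messages=[{"role": "user", "content": "Write hello world in Python."}])]
    sampling_params = dict(n=1, temperature=0.0, max_tokens=256)

    completions = vllm_chatcomplete("path/to/model", prompts, sampling_params)
    save_jsonl(completions, "completions.jsonl")          # persistence at the call site

    # In the repo the score token comes from get_score_token_id(model_path); 42 is a placeholder,
    # and the test script appends JUDGE_PROMPT before scoring (omitted here).
    scores = vllm_score("path/to/model", completions, 42)
    save_jsonl(scores, "scores.jsonl")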