Commit 04e7f84d by nzy

utils: finish TODO: refactor vllm_chatcomplete, vllm_score, and other functions in step1; don't save results inside the functions, return them and let the caller save.

parent 0c079f4d
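The thrust of the change: the step functions no longer load their inputs from disk or write their outputs; they take and return in-memory lists, and the calling script decides what to persist. A minimal sketch of the new convention, assembled from the hunks below (paths and variable names illustrative):

    from pathlib import Path
    from utils import save_jsonl
    from utils_vllm import vllm_chatcomplete
    from step1_sample_apps import mk_sample_prompt
    from step1_evaluate_code import evaluate

    output_dir = Path("out")
    prompts = mk_sample_prompt("path/to/model", "path/to/apps")   # returns a list of prompt dicts
    codes = vllm_chatcomplete("path/to/model", prompts, dict(n=50, temperature=0.6, max_new_tokens=2048))
    results = evaluate(codes, "path/to/apps")                     # returns evaluated samples
    save_jsonl(results, output_dir / "dataset.jsonl")             # persistence happens at the call site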
@@ -137,8 +137,7 @@ def evaluate_incorrect_code_samples_again(results, apps, loop_num):
     return correct_lst

-def evaluate(code_sample_path, dataset_path, output_path):
-    code_samples = load_jsonl(code_sample_path)
+def evaluate(code_samples, dataset_path):
     apps = load_dataset(dataset_path)
     results = evaluate_code_samples(code_samples, apps)
     for item in results:
@@ -146,4 +145,4 @@ def evaluate(code_sample_path, dataset_path, output_path):
         item["eval_result"] = bool(np.all(np.asarray(item["testcase"]) > 0))
     results = evaluate_incorrect_code_samples_again(results, apps, 10)
-    save_jsonl(results, output_path)
+    return results
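Because evaluate now returns the evaluated samples instead of writing them, a caller can inspect them in memory before deciding what to save. A hedged sketch (the pass-rate computation is illustrative, not part of the commit):

    results = evaluate(code_samples, apps_path)
    pass_rate = sum(item["eval_result"] for item in results) / len(results)   # eval_result is a bool per sample
    print(f"sample-level pass rate: {pass_rate:.2%}")
    save_jsonl(results, output_dir / "dataset.jsonl")                         # saving is now the caller's job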
 import argparse
 from pathlib import Path
-from utils import save_json
+from utils import save_json, save_jsonl
 from utils_vllm import vllm_chatcomplete
 from step1_sample_apps import mk_sample_prompt
 from step1_evaluate_code import evaluate
@@ -17,20 +17,17 @@ if __name__ == "__main__":
     output_dir = Path(args.output_dir)
     output_dir.mkdir(exist_ok=True)

-    prompt_path = output_dir / "prompt.jsonl"
-    mk_sample_prompt(args.model, args.apps, prompt_path)
-
-    code_path = output_dir / "sample.jsonl"
+    prompts = mk_sample_prompt(args.model, args.apps)
     sampling_params = dict(n=50, temperature=0.6, max_new_tokens=2048)
     save_json(sampling_params, output_dir / "sampling_params.json")
-    vllm_chatcomplete(args.model, prompt_path, code_path, sampling_params)
+    codes = vllm_chatcomplete(args.model, prompts, sampling_params)
+    save_jsonl(codes, output_dir / "sample.jsonl")

-    dataset_path = output_dir / "dataset.jsonl"
-    evaluate(code_path, args.apps, dataset_path)
+    dataset = evaluate(codes, args.apps)
+    save_jsonl(dataset, output_dir / "dataset.jsonl")

-    train_path = output_dir / "train.jsonl"
-    test_path = output_dir / "test.jsonl"
-    min_test_path = output_dir / "min_test.jsonl"
-    sort_and_split_dataset(
-        dataset_path, train_path, test_path, min_test_path, sampling_params["n"]
-    )
+    train, test, min_test = sort_and_split_dataset(dataset, sampling_params["n"])
+    save_jsonl(train, output_dir / "train.jsonl")
+    save_jsonl(test, output_dir / "test.jsonl")
+    save_jsonl(min_test, output_dir / "min_test.jsonl")
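Since every stage in this driver now exchanges plain lists, the same logic can be exercised on a few problems without creating intermediate files. A hypothetical smoke test (the slice and n=1 are illustrative, not in the commit):

    prompts = mk_sample_prompt(args.model, args.apps)[:5]            # only a handful of problems
    smoke_params = dict(n=1, temperature=0.6, max_new_tokens=2048)   # one sample per prompt
    codes = vllm_chatcomplete(args.model, prompts, smoke_params)
    dataset = evaluate(codes, args.apps)
    print(sum(item["eval_result"] for item in dataset), "of", len(dataset), "samples passed")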
@@ -28,7 +28,7 @@ def mk_prompt(doc) -> str:
     return conversation

-def mk_sample_prompt(model_path, apps_path, output_path):
+def mk_sample_prompt(model_path, apps_path):
     tokenizer = AutoTokenizer.from_pretrained(model_path)
     tokenizer.pad_token = tokenizer.eos_token
@@ -58,5 +58,4 @@ def mk_sample_prompt(model_path, apps_path, output_path):
         prompts.append(dict(problem_id=problem_id, messages=prompt))

-    print(f"size of dataset: {len(prompts)}")
-    save_jsonl(prompts, output_path)
+    return prompts
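mk_sample_prompt now returns the prompt list; each entry is a dict with problem_id and messages, and prompt.jsonl is no longer written for you. A small sketch of the new call site (paths illustrative):

    prompts = mk_sample_prompt("path/to/model", "path/to/apps")
    print(f"size of dataset: {len(prompts)}")             # the removed print can live at the call site if still wanted
    save_jsonl(prompts, Path("out") / "prompt.jsonl")     # optional; the step1 driver no longer saves prompts at all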
@@ -24,14 +24,13 @@ def is_in_test_range(prefix_weight, idx):
     return False

-def sort_and_split_dataset(raw_dataset_path, new_train_path, new_test_path, minimal_test_path, n):
+def sort_and_split_dataset(dataset, n):
     """
     The dataset will be divided into two parts: Train and Test.
     From the Test set, 10% of items across varying difficulties will be selected.
     Among these, only those items for which the LLM can generate correct solutions will be included in the minimal testset.
     This approach reduces the test time by approximately 1/5.
     """
-    dataset = load_jsonl(raw_dataset_path)
     # add `key_for_sort`
     new_train, new_test = [], []
@@ -53,6 +52,6 @@ def sort_and_split_dataset(raw_dataset_path, new_train_path, new_test_path, minimal_test_path, n):
         if has_correct_solution:
             minimal_test.extend(problem)

-    save_jsonl(new_train, new_train_path)
-    save_jsonl(new_test, new_test_path)
-    save_jsonl(minimal_test, minimal_test_path)
+    return new_train, new_test, minimal_test
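sort_and_split_dataset now takes the in-memory dataset and returns the three splits, so the caller persists whichever it needs; the step1 driver above does exactly this. A brief sketch:

    train, test, min_test = sort_and_split_dataset(dataset, sampling_params["n"])
    print(f"train={len(train)}  test={len(test)}  minimal_test={len(min_test)}")
    save_jsonl(min_test, output_dir / "min_test.jsonl")   # e.g. keep only the minimal test set on disk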
@@ -40,17 +40,13 @@ if __name__ == "__main__":
     parser.add_argument("--preference_dataset", type=str)
     parser.add_argument("--llamafactory", type=str)
     parser.add_argument("--dataset_name", type=str)
-    parser.add_argument("--output_dir", type=str)
     args = parser.parse_args()

-    output_dir = Path(args.output_dir)
     preference_dataset = load_json(args.preference_dataset)
     cov_prompts = list(chain(*convert_preference_to_vot_prompt(preference_dataset)))

     sampling_params = dict(n=1, temperature=0.8, max_tokens=2048)
-    reason_path = output_dir / "cov.jsonl"
-    covs = vllm_chatcomplete(args.model, cov_prompts, reason_path, sampling_params)
+    covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params)

     dataset = list(map(convert_cov_to_cov_dataset, covs))
     dataset_info = mk_sft_dataset_info(args.dataset_name)
...
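With --output_dir gone, the CoV generations stay in memory and feed straight into convert_cov_to_cov_dataset. If the raw generations are still wanted on disk, a caller-side save would look like this (hypothetical, not part of this commit; cov_dir is a made-up path):

    covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params)
    # Hypothetical: persist the raw CoV outputs, as the old reason_path used to.
    # save_jsonl(covs, Path(cov_dir) / "cov.jsonl")
    dataset = list(map(convert_cov_to_cov_dataset, covs))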
@@ -35,16 +35,14 @@ def run_sft_model(model_path, test_path, apps_path, reason_prompt=None):
     if reason_prompt:
         test_dataset = [append_prompt(x, COV_PROMPT) for x in test_dataset]
         sampling_params = dict(n=1, temperature=0.0, max_tokens=2048)
-        reason_path = result_dir / "reason.jsonl"
-        test_dataset = vllm_chatcomplete(
-            model_path, test_dataset, reason_path, sampling_params
-        )
+        test_dataset = vllm_chatcomplete(model_path, test_dataset, sampling_params)

     # score
-    score_path = result_dir / "scores.jsonl"
     score_token = get_score_token_id(model_path)
     test_dataset = [append_prompt(x, JUDGE_PROMPT) for x in test_dataset]
-    results = vllm_score(model_path, test_dataset, score_path, score_token)
+    results = vllm_score(model_path, test_dataset, score_token)
+    score_path = result_dir / "scores.jsonl"
+    save_jsonl(results, score_path)

     # compute pass@k
     eval_result_path = result_dir / "passk.jsonl"
...
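Note that reason.jsonl is no longer written here; only scores.jsonl is, and explicitly at the call site. If the intermediate reasoning should still be kept, the caller would add its own save (hypothetical, not in this commit):

    test_dataset = vllm_chatcomplete(model_path, test_dataset, sampling_params)
    # Hypothetical: restore the old behaviour of persisting the reasoning pass.
    # save_jsonl(test_dataset, result_dir / "reason.jsonl")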
@@ -78,12 +78,7 @@ def score_worker(cuda_device, prompts, model_path, score_token):
     return result

-def vllm_chatcomplete(model_path, prompts, output_path, sampling_params):
-    if isinstance(prompts, str):
-        prompts = load_jsonl(prompts)
-    else:
-        assert isinstance(prompts, list)
+def vllm_chatcomplete(model_path, prompts, sampling_params):
     # Respect the slurm's gpu allocation
     cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
     gpu_num = len(cuda_devices)
@@ -100,14 +95,10 @@ def vllm_chatcomplete(model_path, prompts, output_path, sampling_params):
     nested_results = pool.starmap(worker_llm, args)
     results = list(chain(*nested_results))
-    print(f"size of dataset: {len(results)}")
-    save_jsonl(results, output_path)
     return results

-def vllm_score(model_path, prompt_path, output_path, score_token):
-    prompts = load_jsonl(prompt_path)
+def vllm_score(model_path, prompts, score_token):
     # Respect the slurm's gpu allocation
     cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
     gpu_num = len(cuda_devices)
@@ -124,6 +115,4 @@ def vllm_score(model_path, prompt_path, output_path, score_token):
     nested_results = pool.starmap(worker_llm, args)
     results = list(chain(*nested_results))
-    print(f"size of dataset: {len(results)}")
-    save_jsonl(results, output_path)
     return results
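Both helpers now take a list of prompts and return a list of results; the old path-loading branch and the save_jsonl/print calls are gone, so loading and saving are entirely the caller's responsibility. A hedged usage sketch (model path, prompt contents, and the score token are illustrative; CUDA_VISIBLE_DEVICES must be set because both functions read it to split work across GPUs):

    import os
    from utils import save_jsonl
    from utils_vllm import vllm_chatcomplete, vllm_score

    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "0")

    prompts = [dict(problem_id=0, messages=[{"role": "user", "content": "Write hello world in Python."}])]
    sampling_params = dict(n=1, temperature=0.0, max_tokens=256)

    completions = vllm_chatcomplete("path/to/model", prompts, sampling_params)
    save_jsonl(completions, "completions.jsonl")          # persistence at the call site

    # In the repo the score token comes from get_score_token_id(model_path); 42 is a placeholder,
    # and the test script appends JUDGE_PROMPT before scoring (omitted here).
    scores = vllm_score("path/to/model", completions, 42)
    save_jsonl(scores, "scores.jsonl")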