Commit 04e7f84d by nzy

utils: finish todo: refactor vllm_chatcomplete and vllm_score, and other functions in step1. Don't save results in function.
parent 0c079f4d
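The refactor follows one pattern throughout: step functions no longer write JSONL files themselves; they return their results and the caller persists them. Below is a minimal sketch of the new calling convention, reusing the helper names from the diffs that follow (the model name and output directory here are placeholders, not values from the repo):

```python
from pathlib import Path

from utils import save_jsonl
from utils_vllm import vllm_chatcomplete
from step1_sample_apps import mk_sample_prompt

output_dir = Path("out")              # placeholder output directory
output_dir.mkdir(exist_ok=True)

# Before: each helper received an output path and wrote the file itself, e.g.
#   mk_sample_prompt(model, apps, output_dir / "prompt.jsonl")

# After: helpers return plain lists of records; the caller saves them explicitly.
prompts = mk_sample_prompt("my-model", "path/to/apps")    # placeholders
sampling_params = dict(n=50, temperature=0.6, max_new_tokens=2048)
codes = vllm_chatcomplete("my-model", prompts, sampling_params)
save_jsonl(codes, output_dir / "sample.jsonl")            # saving happens at the call site
```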
@@ -137,8 +137,7 @@ def evaluate_incorrect_code_samples_again(results, apps, loop_num):
     return correct_lst
 
 
-def evaluate(code_sample_path, dataset_path, output_path):
-    code_samples = load_jsonl(code_sample_path)
+def evaluate(code_samples, dataset_path):
     apps = load_dataset(dataset_path)
     results = evaluate_code_samples(code_samples, apps)
     for item in results:
@@ -146,4 +145,4 @@ def evaluate(code_sample_path, dataset_path, output_path):
         item["eval_result"] = bool(np.all(np.asarray(item["testcase"]) > 0))
 
     results = evaluate_incorrect_code_samples_again(results, apps, 10)
-    save_jsonl(results, output_path)
+    return results

 import argparse
 from pathlib import Path
 
-from utils import save_json
+from utils import save_json, save_jsonl
 from utils_vllm import vllm_chatcomplete
 from step1_sample_apps import mk_sample_prompt
 from step1_evaluate_code import evaluate
@@ -17,20 +17,17 @@ if __name__ == "__main__":
     output_dir = Path(args.output_dir)
     output_dir.mkdir(exist_ok=True)
 
-    prompt_path = output_dir / "prompt.jsonl"
-    mk_sample_prompt(args.model, args.apps, prompt_path)
+    prompts = mk_sample_prompt(args.model, args.apps)
 
-    code_path = output_dir / "sample.jsonl"
     sampling_params = dict(n=50, temperature=0.6, max_new_tokens=2048)
     save_json(sampling_params, output_dir / "sampling_params.json")
-    vllm_chatcomplete(args.model, prompt_path, code_path, sampling_params)
+    codes = vllm_chatcomplete(args.model, prompts, sampling_params)
+    save_jsonl(codes, output_dir / "sample.jsonl")
 
-    dataset_path = output_dir / "dataset.jsonl"
-    evaluate(code_path, args.apps, dataset_path)
+    dataset = evaluate(codes, args.apps)
+    save_jsonl(dataset, output_dir / "dataset.jsonl")
 
-    train_path = output_dir / "train.jsonl"
-    test_path = output_dir / "test.jsonl"
-    min_test_path = output_dir / "min_test.jsonl"
-    sort_and_split_dataset(
-        dataset_path, train_path, test_path, min_test_path, sampling_params["n"]
-    )
+    train, test, min_test = sort_and_split_dataset(dataset, sampling_params["n"])
+    save_jsonl(train, output_dir / "train.jsonl")
+    save_jsonl(test, output_dir / "test.jsonl")
+    save_jsonl(min_test, output_dir / "min_test.jsonl")

@@ -28,7 +28,7 @@ def mk_prompt(doc) -> str:
     return conversation
 
 
-def mk_sample_prompt(model_path, apps_path, output_path):
+def mk_sample_prompt(model_path, apps_path):
     tokenizer = AutoTokenizer.from_pretrained(model_path)
     tokenizer.pad_token = tokenizer.eos_token
@@ -58,5 +58,4 @@ def mk_sample_prompt(model_path, apps_path, output_path):
         prompts.append(dict(problem_id=problem_id, messages=prompt))
 
     print(f"size of dataset: {len(prompts)}")
-    save_jsonl(prompts, output_path)
+    return prompts

@@ -24,14 +24,13 @@ def is_in_test_range(prefix_weight, idx):
     return False
 
 
-def sort_and_split_dataset(raw_dataset_path, new_train_path, new_test_path, minimal_test_path, n):
+def sort_and_split_dataset(dataset, n):
     """
     The dataset will be divided into two parts: Train and Test.
     From the Test set, 10% of items across varying difficulties will be selected.
     Among these, only those items for which the LLM can generate correct solutions will be included in the minimal testset.
     This approach reduces the test time by approximately 1/5.
     """
-    dataset = load_jsonl(raw_dataset_path)
 
     # add `key_for_sort`
     new_train, new_test = [], []
@@ -53,6 +52,6 @@ def sort_and_split_dataset(raw_dataset_path, new_train_path, new_test_path, minimal_test_path, n):
         if has_correct_solution:
             minimal_test.extend(problem)
 
-    save_jsonl(new_train, new_train_path)
-    save_jsonl(new_test, new_test_path)
-    save_jsonl(minimal_test, minimal_test_path)
+    return new_train, new_test, minimal_test
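The docstring above describes the selection rule in prose. Here is a toy sketch of the idea only, not the repo's implementation (the real code uses is_in_test_range and difficulty-aware sorting, which are elided here):

```python
def toy_minimal_test(test_problems):
    """Keep roughly 10% of test problems, and of those only the ones where at
    least one sampled solution passed, mirroring the docstring's description."""
    minimal_test = []
    for idx, problem in enumerate(test_problems):  # problem = list of samples for one task
        if idx % 10 != 0:                          # stand-in for the ~10% selection
            continue
        if any(item["eval_result"] for item in problem):
            minimal_test.extend(problem)
    return minimal_test
```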

@@ -40,17 +40,13 @@ if __name__ == "__main__":
     parser.add_argument("--preference_dataset", type=str)
     parser.add_argument("--llamafactory", type=str)
     parser.add_argument("--dataset_name", type=str)
-    parser.add_argument("--output_dir", type=str)
     args = parser.parse_args()
 
-    output_dir = Path(args.output_dir)
     preference_dataset = load_json(args.preference_dataset)
     cov_prompts = list(chain(*convert_preference_to_vot_prompt(preference_dataset)))
     sampling_params = dict(n=1, temperature=0.8, max_tokens=2048)
-    reason_path = output_dir / "cov.jsonl"
-    covs = vllm_chatcomplete(args.model, cov_prompts, reason_path, sampling_params)
+    covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params)
     dataset = list(map(convert_cov_to_cov_dataset, covs))
     dataset_info = mk_sft_dataset_info(args.dataset_name)

@@ -35,16 +35,14 @@ def run_sft_model(model_path, test_path, apps_path, reason_prompt=None):
     if reason_prompt:
         test_dataset = [append_prompt(x, COV_PROMPT) for x in test_dataset]
         sampling_params = dict(n=1, temperature=0.0, max_tokens=2048)
-        reason_path = result_dir / "reason.jsonl"
-        test_dataset = vllm_chatcomplete(
-            model_path, test_dataset, reason_path, sampling_params
-        )
+        test_dataset = vllm_chatcomplete(model_path, test_dataset, sampling_params)
 
     # score
-    score_path = result_dir / "scores.jsonl"
     score_token = get_score_token_id(model_path)
     test_dataset = [append_prompt(x, JUDGE_PROMPT) for x in test_dataset]
-    results = vllm_score(model_path, test_dataset, score_path, score_token)
+    results = vllm_score(model_path, test_dataset, score_token)
+    score_path = result_dir / "scores.jsonl"
+    save_jsonl(results, score_path)
 
     # compute pass@k
     eval_result_path = result_dir / "passk.jsonl"

@@ -78,12 +78,7 @@ def score_worker(cuda_device, prompts, model_path, score_token):
     return result
 
 
-def vllm_chatcomplete(model_path, prompts, output_path, sampling_params):
-    if isinstance(prompts, str):
-        prompts = load_jsonl(prompts)
-    else:
-        assert isinstance(prompts, list)
+def vllm_chatcomplete(model_path, prompts, sampling_params):
     # Respect the slurm's gpu allocation
     cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
     gpu_num = len(cuda_devices)
@@ -100,14 +95,10 @@ def vllm_chatcomplete(model_path, prompts, output_path, sampling_params):
     nested_results = pool.starmap(worker_llm, args)
     results = list(chain(*nested_results))
     print(f"size of dataset: {len(results)}")
-    save_jsonl(results, output_path)
+    return results
 
 
-def vllm_score(model_path, prompt_path, output_path, score_token):
-    prompts = load_jsonl(prompt_path)
+def vllm_score(model_path, prompts, score_token):
     # Respect the slurm's gpu allocation
     cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(',')
     gpu_num = len(cuda_devices)
@@ -124,6 +115,4 @@ def vllm_score(model_path, prompt_path, output_path, score_token):
     nested_results = pool.starmap(worker_llm, args)
     results = list(chain(*nested_results))
     print(f"size of dataset: {len(results)}")
-    save_jsonl(results, output_path)
+    return results
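With both helpers now returning lists, a caller can chain generation and scoring in memory and save once at the end. A sketch of that usage, assuming the signatures shown above (the model path, prompt contents, and score token value below are placeholders; in the pipeline itself the judge prompt is appended and the token comes from get_score_token_id):

```python
from utils import save_jsonl
from utils_vllm import vllm_chatcomplete, vllm_score

model = "path/to/model"                                  # placeholder
prompts = [dict(problem_id=0, messages=[{"role": "user", "content": "..."}])]

sampling_params = dict(n=1, temperature=0.0, max_tokens=2048)
completions = vllm_chatcomplete(model, prompts, sampling_params)  # returns records, writes no file

score_token = 9454                                       # placeholder; see get_score_token_id above
scores = vllm_score(model, completions, score_token)

save_jsonl(scores, "scores.jsonl")                       # persistence is explicit at the call site
```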