Commit 91cf4380 by nzy

improve eval_apps

parent b319f162
```diff
 # CodeCritic
+
+## Installation
+
+```
+pip install scikit-learn
+pip install
+```
+
 ## Evaluation
 
 ### APPS-500 (Top@k)
```
......
```diff
@@ -1,4 +1,5 @@
 import argparse
+from pathlib import Path
 import os
 import json
 from functools import partial
@@ -14,7 +15,7 @@ from codecritic.dataset.code import extract_code
 from codecritic.evaluation.apps_eval import evaluate
 from codecritic.utils.inference import generate_worker
 from codecritic.utils.parallel import model_map
-from codecritic.utils.json import save_jsonl
+from codecritic.utils.json import load_jsonl, save_jsonl
 
 
 def transform_to_prompt(apps, tokenizer):
@@ -56,49 +57,60 @@ if __name__ == "__main__":
     parser.add_argument("--train", type=str, help="path/to/train")
     parser.add_argument("--test", type=str, help="path/to/test")
     parser.add_argument(
-        "--gpu", type=int, default=1, help="gpu number required by one model"
+        "--tp", type=int, default=1, help="tensor parallel"
     )
     args = parser.parse_args()
 
     os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
     apps = load_dataset(args.apps)
-    tokenizer = AutoTokenizer.from_pretrained(args.model)
-    dataset = transform_to_prompt(apps, tokenizer)
-
-    # sampling
-    sampling_params = SamplingParams(
-        n=50,
-        temperature=0.8,
-        top_p=0.95,
-        max_tokens=2048,
-    )
-
-    worker = partial(
-        generate_worker, model_path=args.model, sampling_params=sampling_params
-    )
-    dataset = model_map(worker, dataset, args.gpu)
-
-    # postprocess
-    grouped = defaultdict(list)
-    for sample in dataset:
-        grouped[sample["task_id"]].append(sample)
-
-    def is_in_test(task_id):
-        split, idx = task_id.split("-")
-        idx = int(idx)
-        if split == "test":
-            for start, end in [(0, 300), (3000, 3100), (4000, 4100)]:
-                if start <= idx < end:
-                    return True
-        return False
-
-    trainset, testset = [], []
-    for task_id, group in grouped.items():
-        target = testset if is_in_test(task_id) else trainset
-        for idx, sample in enumerate(group):
-            sample["solution_id"] = idx
-            sample["code"] = extract_code(sample["messages"][-1]["content"])
-            target.append(sample)
+    train_raw_path = Path(args.train + ".raw")
+    test_raw_path = Path(args.test + ".raw")
+
+    if not (train_raw_path.exists() and test_raw_path.exists()):
+        tokenizer = AutoTokenizer.from_pretrained(args.model)
+        dataset = transform_to_prompt(apps, tokenizer)
+
+        # sampling
+        sampling_params = SamplingParams(
+            n=50,
+            temperature=0.8,
+            top_p=0.95,
+            max_tokens=2048,
+        )
+
+        worker = partial(
+            generate_worker, model_path=args.model, sampling_params=sampling_params
+        )
+        dataset = model_map(worker, dataset, args.tp)
+
+        # postprocess
+        grouped = defaultdict(list)
+        for sample in dataset:
+            grouped[sample["task_id"]].append(sample)
+
+        def is_in_test(task_id):
+            split, idx = task_id.split("-")
+            idx = int(idx)
+            if split == "test":
+                for start, end in [(0, 300), (3000, 3100), (4000, 4100)]:
+                    if start <= idx < end:
+                        return True
+            return False
+
+        trainset, testset = [], []
+        for task_id, group in grouped.items():
+            target = testset if is_in_test(task_id) else trainset
+            for idx, sample in enumerate(group):
+                sample["solution_id"] = idx
+                sample["code"] = extract_code(sample["messages"][-1]["content"])
+                target.append(sample)
+
+        save_jsonl(trainset, train_raw_path)
+        save_jsonl(testset, test_raw_path)
+    else:
+        trainset = load_jsonl(train_raw_path)
+        testset = load_jsonl(test_raw_path)
 
     trainset = evaluate(trainset, apps)
     testset = evaluate(testset, apps)
```
......
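The change above wraps the expensive sampling pass (50 completions per problem) in a cache check, so a rerun reloads `<train>.raw` / `<test>.raw` instead of regenerating everything; note also that the three index ranges in `is_in_test` keep 300 + 100 + 100 = 500 test problems, matching the APPS-500 subset named in the README. Below is a minimal sketch of the same save-or-reload pattern, assuming the repo's JSONL helpers write one JSON object per line; `expensive_sampling` is a hypothetical stand-in for the sampling pipeline:

```python
import json
from pathlib import Path


def save_jsonl(rows, path):
    # one JSON object per line, as the repo's save_jsonl presumably does
    with open(path, "w") as f:
        for row in rows:
            f.write(json.dumps(row) + "\n")


def load_jsonl(path):
    with open(path) as f:
        return [json.loads(line) for line in f]


def sample_or_load(out_path, expensive_sampling):
    # reuse a previous run's raw output when it exists; otherwise run
    # the expensive pass once and persist it for the next invocation
    raw_path = Path(str(out_path) + ".raw")
    if raw_path.exists():
        return load_jsonl(raw_path)
    rows = expensive_sampling()
    save_jsonl(rows, raw_path)
    return rows
```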
```diff
@@ -24,7 +24,7 @@ if __name__ == "__main__":
         help="maximum number of tokens allowed for the reasoning process.",
     )
     parser.add_argument(
-        "--gpu", type=int, default=1, help="gpu number required by one model"
+        "--tp", type=int, default=1, help="tensor parallel"
     )
     args = parser.parse_args()
@@ -47,7 +47,7 @@ if __name__ == "__main__":
     worker = partial(
         generate_worker, model_path=args.model, sampling_params=sampling_params
     )
-    dataset = model_map(worker, dataset, args.gpu)
+    dataset = model_map(worker, dataset, args.tp)
 
 
     def get_token_id(token):
         score_tokens = tokenizer.encode(token, add_special_tokens=False)
```
......
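Both entry points rename `--gpu` to `--tp`, matching vLLM's tensor-parallel vocabulary: the flag shards one model instance across `tp` GPUs rather than counting GPUs generically. `generate_worker` and `model_map` live elsewhere in the repo and are not shown in this diff; a hedged sketch of what such a worker could look like with vLLM's public API:

```python
from vllm import LLM, SamplingParams


def generate_worker(prompts, model_path, sampling_params, tp=1):
    # tensor_parallel_size splits the model's weights across `tp` GPUs;
    # the repo's actual generate_worker may differ
    llm = LLM(model=model_path, tensor_parallel_size=tp)
    outputs = llm.generate(prompts, sampling_params)
    return [[c.text for c in out.outputs] for out in outputs]


# usage with the sampling settings from the diff (prompts are hypothetical):
# params = SamplingParams(n=50, temperature=0.8, top_p=0.95, max_tokens=2048)
# completions = generate_worker(["def add(a, b):"], "path/to/model", params, tp=2)
```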
```diff
@@ -82,7 +82,8 @@ def evaluate_code_samples(code_samples, apps):
         args.append((apps[split][int(idx)], sample))
 
     cpu_num = multiprocessing.cpu_count()
-    chunksize = max(len(code_samples) // (cpu_num * 5), 1)
+    # chunksize = max(len(code_samples) // (cpu_num * 5), 1)
+    chunksize = 10000
     # TODO performance?
     results = process_map(
         test_generation, args, max_workers=cpu_num, chunksize=chunksize
@@ -101,7 +102,7 @@ def evaluate(code_samples, apps):
     The 'loop_num' parameter controls the number of times the function will be retried until the test framework obtains a consistent result.
     """
     all_results = []
-    for _ in range(3):
+    for _ in range(2):
         results = evaluate_code_samples(code_samples, apps)
         all_results.append(results)
```
......
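Two evaluator tweaks here: `chunksize` for `process_map` is pinned at 10000 instead of scaling with input size (larger chunks amortize inter-process dispatch overhead when individual tests are cheap, per the `# TODO performance?` note), and the consistency loop drops from three evaluation runs to two. The diff doesn't show how the repeated runs are merged; one plausible reconciliation, assuming each run yields a boolean verdict per sample, is to require agreement across runs:

```python
def merge_runs(all_results):
    # treat a sample as passing only if every run agrees, which guards
    # against flaky failures (timeouts, nondeterministic tests)
    return [all(verdicts) for verdicts in zip(*all_results)]


# e.g. two runs over three samples:
# merge_runs([[True, False, True], [True, True, True]]) -> [True, False, True]
```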