Commit 91cf4380 by nzy

improve eval_apps

parent b319f162
# CodeCritic
## Installation
```
pip install scikit-learn
pip install
```
## Evaluation
### APPS-500 (Top@k)
......
import argparse
from pathlib import Path
import os
import json
from functools import partial
......@@ -14,7 +15,7 @@ from codecritic.dataset.code import extract_code
from codecritic.evaluation.apps_eval import evaluate
from codecritic.utils.inference import generate_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import save_jsonl
from codecritic.utils.json import load_jsonl, save_jsonl
def transform_to_prompt(apps, tokenizer):
......@@ -56,49 +57,60 @@ if __name__ == "__main__":
parser.add_argument("--train", type=str, help="path/to/train")
parser.add_argument("--test", type=str, help="path/to/test")
parser.add_argument(
"--gpu", type=int, default=1, help="gpu number required by one model"
"--tp", type=int, default=1, help="tensor parallel"
)
args = parser.parse_args()
os.environ["TOKENIZERS_PARALLELISM"] = "false"
apps = load_dataset(args.apps)
tokenizer = AutoTokenizer.from_pretrained(args.model)
dataset = transform_to_prompt(apps, tokenizer)
# sampling
sampling_params = SamplingParams(
n=50,
temperature=0.8,
top_p=0.95,
max_tokens=2048,
)
worker = partial(
generate_worker, model_path=args.model, sampling_params=sampling_params
)
dataset = model_map(worker, dataset, args.gpu)
# postprocess
grouped = defaultdict(list)
for sample in dataset:
grouped[sample["task_id"]].append(sample)
def is_in_test(task_id):
    """Tell whether an APPS task id belongs to the held-out test subset.

    ``task_id`` has the form ``"<split>-<index>"``. A task is in the test
    subset only when the split is ``"test"`` and the index lies in one of
    the half-open ranges [0, 300), [3000, 3100) or [4000, 4100).

    Raises ValueError when ``task_id`` does not consist of exactly two
    "-"-separated parts or when the index part is not an integer.
    """
    prefix, number = task_id.split("-")
    # Parse the index first so that malformed ids fail for every split.
    number = int(number)
    if prefix != "test":
        return False
    test_ranges = ((0, 300), (3000, 3100), (4000, 4100))
    return any(low <= number < high for low, high in test_ranges)
trainset, testset = [], []
for task_id, group in grouped.items():
target = testset if is_in_test(task_id) else trainset
for idx, sample in enumerate(group):
sample["solution_id"] = idx
sample["code"] = extract_code(sample["messages"][-1]["content"])
target.append(sample)
train_raw_path = Path(args.train + ".raw")
test_raw_path = Path(args.test + ".raw")
if not (train_raw_path.exists() and test_raw_path.exists()):
tokenizer = AutoTokenizer.from_pretrained(args.model)
dataset = transform_to_prompt(apps, tokenizer)
# sampling
sampling_params = SamplingParams(
n=50,
temperature=0.8,
top_p=0.95,
max_tokens=2048,
)
worker = partial(
generate_worker, model_path=args.model, sampling_params=sampling_params
)
dataset = model_map(worker, dataset, args.tp)
# postprocess
grouped = defaultdict(list)
for sample in dataset:
grouped[sample["task_id"]].append(sample)
def is_in_test(task_id):
    """Tell whether an APPS task id belongs to the held-out test subset.

    ``task_id`` has the form ``"<split>-<index>"``. A task is in the test
    subset only when the split is ``"test"`` and the index lies in one of
    the half-open ranges [0, 300), [3000, 3100) or [4000, 4100).

    Raises ValueError when ``task_id`` does not consist of exactly two
    "-"-separated parts or when the index part is not an integer.
    """
    prefix, number = task_id.split("-")
    # Parse the index first so that malformed ids fail for every split.
    number = int(number)
    if prefix != "test":
        return False
    test_ranges = ((0, 300), (3000, 3100), (4000, 4100))
    return any(low <= number < high for low, high in test_ranges)
trainset, testset = [], []
for task_id, group in grouped.items():
target = testset if is_in_test(task_id) else trainset
for idx, sample in enumerate(group):
sample["solution_id"] = idx
sample["code"] = extract_code(sample["messages"][-1]["content"])
target.append(sample)
save_jsonl(trainset, train_raw_path)
save_jsonl(testset, test_raw_path)
else:
trainset = load_jsonl(train_raw_path)
testset = load_jsonl(test_raw_path)
trainset = evaluate(trainset, apps)
testset = evaluate(testset, apps)
......
......@@ -24,7 +24,7 @@ if __name__ == "__main__":
help="maximum number of tokens allowed for the reasoning process.",
)
parser.add_argument(
"--gpu", type=int, default=1, help="gpu number required by one model"
"--tp", type=int, default=1, help="tensor parallel"
)
args = parser.parse_args()
......@@ -47,7 +47,7 @@ if __name__ == "__main__":
worker = partial(
generate_worker, model_path=args.model, sampling_params=sampling_params
)
dataset = model_map(worker, dataset, args.gpu)
dataset = model_map(worker, dataset, args.tp)
def get_token_id(token):
score_tokens = tokenizer.encode(token, add_special_tokens=False)
......
......@@ -82,7 +82,8 @@ def evaluate_code_samples(code_samples, apps):
args.append((apps[split][int(idx)], sample))
cpu_num = multiprocessing.cpu_count()
chunksize = max(len(code_samples) // (cpu_num * 5), 1)
# chunksize = max(len(code_samples) // (cpu_num * 5), 1)
chunksize = 10000
# TODO: benchmark this fixed chunksize against the CPU-count-based heuristic commented out above.
results = process_map(
test_generation, args, max_workers=cpu_num, chunksize=chunksize
......@@ -101,7 +102,7 @@ def evaluate(code_samples, apps):
The evaluation is repeated a fixed number of times (the count is hard-coded in the loop below; there is no 'loop_num' parameter) so that the results of the test framework can be checked for consistency across runs.
"""
all_results = []
for _ in range(3):
for _ in range(2):
results = evaluate_code_samples(code_samples, apps)
all_results.append(results)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment