Commit 462724df by nanziyuan

fix

parent 09dfbc06
import itertools
import json
from typing import Any, Dict, List, Tuple, Union

import numpy as np
import pandas as pd

SUCCESS = "success"


# unbiased estimator from https://github.com/openai/human-eval
def estimate_pass_at_k(
    num_samples: Union[int, List[int], np.ndarray],
    num_correct: Union[List[int], np.ndarray],
    k: int,
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    if isinstance(num_samples, int):
        num_samples_it = itertools.repeat(num_samples, len(num_correct))
    else:
        assert len(num_samples) == len(num_correct)
        num_samples_it = iter(num_samples)

    return np.array(
        [estimator(int(n), int(c), k) for n, c in zip(num_samples_it, num_correct)]
    )
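
# Worked example (illustration only, not part of the original script): with
# n = 10 samples, c = 3 correct, and k = 5, the closed form gives
#   1 - comb(7, 5) / comb(10, 5) = 1 - 21 / 252 ~= 0.9167,
# which matches the numerically stable product used above:
#   estimate_pass_at_k(10, [3], 5)  # -> array([0.91666667])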
def count(path):
    with open(path, "r") as f:
        results = json.load(f)

    total = np.array([r["nfiles"] for r in results["eval"].values()])

    base_correct = []
    new_correct = []
    for res in results["eval"].values():
        bc = sum([r[0] == SUCCESS for r in res["base"]])
        base_correct.append(bc)
        if res["plus"]:
            new_correct.append(
                sum(
                    [
                        res["plus"][i][0] == res["base"][i][0] == SUCCESS
                        for i in range(len(res["plus"]))
                    ]
                )
            )
    base_correct = np.array(base_correct)

    pass_at_k = [
        {"k": k, "pass@k": estimate_pass_at_k(total, base_correct, k).mean()}
        for k in range(1, total.min() + 1)
        if total.min() >= k
    ]

    with open("result.jsonl", "w") as f:
        for r in pass_at_k:
            f.write(json.dumps(r) + '\n')
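
# For reference, count() above assumes a results file shaped roughly as follows
# (keys inferred from the reads above; the task id and counts are illustrative,
# and any status other than "success" is treated as a failure):
# {
#   "eval": {
#     "HumanEval/0": {
#       "nfiles": 200,
#       "base": [["success", ...], ...],
#       "plus": [["success", ...], ...]
#     }
#   }
# }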
import pandas as pd
from pathlib import Path
import logging
import json
from tqdm import tqdm
from transformers import AutoTokenizer
from dataclasses import dataclass

codellamas_path = Path("/lustre/models/CodeLlama_240112/")
models_path = {
    "7b": "codellama-CodeLlama-7b-hf",
    "13b": "codellama-CodeLlama-13b-hf",
    "34b": "CodeLlama-34b-hf",
}

CODE_EOS = 28956  # '```' in codellama's tokenizer
EOS = 2  # default eos
STOP_TOKENS = [CODE_EOS, EOS]


def mk_prompt(question: str, answer: str) -> str:
    return question + answer


def count_token(encode, cache, r):
    program = r["completion"]
    question = cache[r["task_id"]]
    return len(encode(mk_prompt(question, program)))


def main(model_name, sample_path):
    model_path = codellamas_path / models_path[model_name]
    tokenizer = AutoTokenizer.from_pretrained(model_path)

    with open("../testsets/humaneval/humaneval_prompt.jsonl", "r") as f:
        lines = [json.loads(l) for l in f.readlines()]
    question_cache = {l["task_id"]: l["prompt"] for l in lines}

    programs = pd.read_json(sample_path, lines=True)
    count_token_p = lambda r: count_token(tokenizer.encode, question_cache, r)
    programs["token_num"] = programs.apply(count_token_p, axis=1)
    print(programs["token_num"].mean())
if __name__ == "__main__":
    import argparse

    argparser = argparse.ArgumentParser()
    argparser.add_argument("name", type=str)
    argparser.add_argument("path", type=str)
    args = argparser.parse_args()

    logging.basicConfig(format="%(asctime)s %(name)s:%(levelname)s:%(message)s",
                        datefmt="%d-%m-%Y %H:%M:%S",
                        encoding='utf-8',
                        level=logging.INFO)
    logging.info(f"START: Testing codellama-{args.name}")
    main(args.name, args.path)
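
# Example invocation (illustrative; the script filename and sample path are
# hypothetical): the positional args are the model size key and a jsonl file of
# completions holding one {"task_id": ..., "completion": ...} object per line:
#   python count_tokens.py 7b /path/to/samples.jsonl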
@@ -54,7 +54,7 @@ echo $(which python3)
 cluster-quota # nas quota
 #- Job step
-python /lustre/S/nanziyuan/projects/codellama_test/src/eval.py -n {name} -s {nstart} -e {nend}
+{test_program}
 #- End
 echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
@@ -81,7 +81,7 @@ def mk_config(llm_name, testset_name, exp_id):
         completion_dir=completion_dir.as_posix(),
         k=1000,
         num_gpus=num_gpus,
-        stop=["def"],
+        stop=["\nclass", "\ndef", "\n#", "\n@", "\nprint", "\nif", "\n```"],
     )
     cfg_path = path / "config.json"
@@ -90,4 +90,4 @@ def mk_config(llm_name, testset_name, exp_id):
     mk_slurm(llm_name, num_gpus, cfg_path, slurm_dir)

 if __name__ == "__main__":
-    mk_config("70b_base", "apps_subset", 0)
+    mk_config("7b_base", "humaneval", 1)
@@ -11,8 +11,25 @@ def merge_jsonl(completion_path, target_path):
         with open(f, "r") as infile:
             outfile.write(infile.read())

-def mk_test_slurm(exp_path):
-    name = Path(exp_path).stem
+def mk_eval(exp_path):
+    exp_path = Path(exp_path)
+    name = exp_path.stem
     eval_path = EXP / f"{name}-test"
+    eval_path.mkdir()
     completion_path = exp_path / "completion"
     merge_jsonl(completion_path, eval_path)
+
+    testset = name.split('-')[1]
+    if testset == "humaneval":
+        test_program = [f"cd {eval_path.absolute()}",
+                        "evalplus.evaluate --dataset humaneval --samples samples.jsonl"]
+    else:
+        raise NotImplementedError(f"Unknown testset: {testset}")
+    test_program = "\n".join(test_program)
+
+    with open("cpu.slurm.template", "r") as f:
+        template = f.read()
+    with open(eval_path / "cpu.slurm", "w") as f:
+        f.write(template.format(name=name, test_program=test_program))
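
Note (illustrative, not part of the commit): for a humaneval experiment, the test_program list above joins into two shell lines, e.g.

    cd <eval_path>
    evalplus.evaluate --dataset humaneval --samples samples.jsonl

which mk_eval substitutes into the {test_program} slot of cpu.slurm.template, matching the slurm change in the first hunk.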