Commit eb972b4c by nzy

transfer to llmkit

parent 432936ac
import argparse
import os
from pathlib import Path
import pprint

from tqdm import tqdm
import torch
import torch.nn.functional as F
import transformers
import accelerate

from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.data.verify import get_score_token_id
from codecritic.utils.metric import group_results, score_pass_at_k


@torch.inference_mode()
def hf_score(accelerator, model, tokenizer, prompts):
    score_token = get_score_token_id(tokenizer)

    with accelerator.split_between_processes(prompts) as partial_prompts:
        results = []
        for item in tqdm(partial_prompts):
            input_ids = tokenizer.apply_chat_template(
                item["messages"], add_generation_prompt=True, return_tensors="pt"
            ).to("cuda")
            output = model(input_ids)
            next_token_logits = output.logits[0, -1, :]
            score = F.softmax(next_token_logits, dim=0)[score_token].item()
            results.append({**item, "score": score})

    return accelerate.utils.gather_object(results)
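
# A minimal sketch of the scoring rule above, with made-up logits: the score is
# simply the softmax probability assigned to the designated score token at the
# final position.
#
#   logits = torch.tensor([2.0, 0.5, 1.0])
#   F.softmax(logits, dim=0)[0].item()   # -> about 0.63 if the score token id were 0
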

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str)
    parser.add_argument("--test", type=str)
    parser.add_argument("--apps", type=str)
    args = parser.parse_args()

    home_path = Path(args.model).parent
    result_dir = home_path / "hf_eval"
    result_dir.mkdir(exist_ok=True)

    prompts = load_jsonl(args.test)

    # os.environ["TOKENIZERS_PARALLELISM"] = "false"
    accelerator = accelerate.Accelerator()
    tokenizer = transformers.AutoTokenizer.from_pretrained(args.model)
    model = transformers.AutoModelForCausalLM.from_pretrained(args.model, device_map="auto")
    # model, tokenizer = accelerator.prepare(model, tokenizer)

    for name, param in model.named_parameters():
        print(f"{name}: {param.device}")

    # model.generation_config.pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id else tokenizer.eos_token_id
    # model.generation_config.eos_token_id = tokenizer.eos_token_id
    model.eval()

    accelerator.wait_for_everyone()
    results = hf_score(accelerator, model, tokenizer, prompts)

    if accelerator.is_main_process:
        score_path = result_dir / "scores.jsonl"
        save_jsonl(results, score_path)

        # compute pass@k
        eval_result_path = result_dir / "passk.jsonl"
        # results = load_jsonl(score_path)
        groups = group_results(results, args.apps)
        eval_results = [score_pass_at_k(groups, k, home_path.stem) for k in range(1, 16)]
        save_jsonl(eval_results, eval_result_path)
        pprint.pp(eval_results)


if __name__ == "__main__":
    main()
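
# Example launch of the scoring script above (the script name and paths are
# hypothetical; the flags match the argparse definitions in main()):
#
#   accelerate launch score_with_hf.py \
#       --model ckpts/critic --test data/test.jsonl --apps <path-or-name-of-APPS>
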

import argparse
from pathlib import Path

from codecritic.utils.json import save_json, save_jsonl
from codecritic.utils.vllm import vllm_chatcomplete
from codecritic.sampling.sample_apps import mk_sample_prompt
from codecritic.sampling.evaluate_code import evaluate
from codecritic.sampling.sort_split_dataset import sort_and_split_dataset


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str)
    parser.add_argument("--apps", type=str)
    parser.add_argument("--output_dir", type=str)
    args = parser.parse_args()

    output_dir = Path(args.output_dir)
    output_dir.mkdir(exist_ok=True)

    prompts = mk_sample_prompt(args.model, args.apps)

    sampling_params = dict(n=50, temperature=0.6, max_new_tokens=2048)
    save_json(sampling_params, output_dir / "sampling_params.json")

    codes = vllm_chatcomplete(args.model, prompts, sampling_params)
    save_jsonl(codes, output_dir / "sample.jsonl")

    dataset = evaluate(codes, args.apps)
    save_jsonl(dataset, output_dir / "dataset.jsonl")

    train, test, min_test = sort_and_split_dataset(dataset, sampling_params["n"])
    save_jsonl(train, output_dir / "train.jsonl")
    save_jsonl(test, output_dir / "test.jsonl")
    save_jsonl(min_test, output_dir / "min_test.jsonl")
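
# Example run of this sampling pipeline (hypothetical script name and paths; the
# flags come from the argparse setup above). It writes sampling_params.json,
# sample.jsonl, dataset.jsonl, train.jsonl, test.jsonl and min_test.jsonl into
# --output_dir:
#
#   python sample_pipeline.py --model <hf-model> --apps <path-or-name-of-APPS> --output_dir out/
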

# copy from codeparrot/apps_metric/utils.py
# https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/utils.py
import json
import multiprocessing

import numpy as np
from datasets import load_dataset
from tqdm.contrib.concurrent import process_map

from codecritic.sampling.apps_test import run_test
from codecritic.utils.json import save_jsonl
from codecritic.data.code import extract_code

TIMEOUT = 10


def check_correctness(sample, generation, timeout, debug=False):
    """Check correctness of code generation with a global timeout.
    The global timeout is to catch some extreme/rare cases not handled by the timeouts
    inside `run_test`."""

    def _temp_run(sample, generation, debug, result):
        result.append(run_test(sample, test=generation, debug=debug))

    manager = multiprocessing.Manager()
    result = manager.list()
    p = multiprocessing.Process(
        target=_temp_run, args=(sample, generation, debug, result)
    )
    p.start()
    p.join(timeout=timeout + 1)
    if p.is_alive():
        p.kill()
    if not result:
        in_outs = json.loads(sample["input_output"])
        # consider that all tests failed
        result = [[-1 for _ in range(len(in_outs["inputs"]))]]
        if debug:
            print("global timeout")
    return result[0]
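
# Result-code convention, as used in this file (illustration only): entries that
# are truthy/positive count as passed test cases, -1 is filled in on a global
# timeout ("consider that all tests failed"), and -2 is the default when the
# test framework raises before producing results. Downstream code therefore
# treats a sample as correct only when every entry is > 0, e.g.:
#
#   bool(np.all(np.asarray([True, True]) > 0))   # -> True, all tests passed
#   bool(np.all(np.asarray([True, -1]) > 0))     # -> False
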

def test_generation(args, debug=False):
    apps_item, code_sample = args
    message = code_sample["messages"][-1]
    assert message["role"] == "assistant"
    code = extract_code(message["content"])

    curr_res = [-2]
    try:
        curr_res = check_correctness(apps_item, code, timeout=TIMEOUT, debug=debug)
        if debug:
            print(f"\nSuccessful compilation of task {code}!")
        fixed = []
        for e in curr_res:
            if isinstance(e, np.ndarray):
                e = e.item(0)
            if isinstance(e, np.bool_):
                e = bool(e)
            fixed.append(e)
        curr_res = fixed
        if not np.all(curr_res):
            if debug:
                print(curr_res)
                print("Results were not True for all test cases")
    except Exception as e:
        if debug:
            print(f"Compilation failed, test framework exception = {repr(e)}{e}\n")
    finally:
        assert isinstance(curr_res, list)

    problem_result = np.asarray(curr_res)
    code_sample["eval_result"] = bool(np.all(problem_result > 0))
    code_sample["testcase"] = curr_res
    return code_sample


def get_apps_item(item, apps):
    problem_id = item["problem_id"]
    split, idx = problem_id.split("_")
    # get corresponding samples from APPS dataset
    return apps[split][int(idx)]


def evaluate_code_samples(code_samples, apps):
    args = [(get_apps_item(sample, apps), sample) for sample in code_samples]
    cpu_num = multiprocessing.cpu_count()
    chunksize = max(len(code_samples) // (cpu_num * 5), 1)
    results = process_map(
        test_generation, args, max_workers=cpu_num, chunksize=chunksize
    )
    return results


def evaluate_incorrect_code_samples_again(results, apps, loop_num):
    """
    There are some hard-to-reproduce bugs in the APPS evaluation: the same code
    can yield different 'eval_result' values across runs, typically because the
    test framework hits an exception or decides the code timed out when it
    should not have.
    This function is an ugly workaround for that problem: whenever a sample's
    result contains a timeout or exception code, it is re-evaluated and the new
    result is compared with the old one. 'loop_num' bounds how many times
    re-evaluation is retried before giving up on obtaining a consistent result.
    """
    maybe_incorrect_lst, correct_lst = [], []
    for item in results:
        if any(x in item["testcase"] for x in (-1, -2)):
            maybe_incorrect_lst.append(item)
        else:
            correct_lst.append(item)

    for _ in range(loop_num):
        if len(maybe_incorrect_lst) == 0:
            break
        new_results = evaluate_code_samples(maybe_incorrect_lst, apps)
        print(f"maybe incorrect lst size: {len(maybe_incorrect_lst)}")

        check_lst = []
        for i in range(len(new_results)):
            old_item, new_item = maybe_incorrect_lst[i], new_results[i]
            old_eval, new_eval = old_item["eval_result"], new_item["eval_result"]
            if old_eval == new_eval:
                correct_lst.append(old_item)
            else:
                check_lst.append(new_item)
                print(old_item["problem_id"], old_eval, new_item["problem_id"], new_eval)
        maybe_incorrect_lst = check_lst

    if len(results) != len(correct_lst):
        save_jsonl(maybe_incorrect_lst, "debug.jsonl")
        # raise ValueError("cannot correctly evaluate codes")
        print("cannot correctly evaluate code; see debug.jsonl")
        if len(maybe_incorrect_lst) < 5:
            correct_lst.extend(maybe_incorrect_lst)
    return correct_lst


def evaluate(code_samples, dataset_path):
    apps = load_dataset(dataset_path)
    results = evaluate_code_samples(code_samples, apps)
    # 'testcase' already holds the per-test results from test_generation;
    # recompute 'eval_result' from it rather than overwriting 'testcase'.
    for item in results:
        item["eval_result"] = bool(np.all(np.asarray(item["testcase"]) > 0))
    results = evaluate_incorrect_code_samples_again(results, apps, 10)
    return results


from datasets import load_dataset
import json

from transformers import AutoTokenizer


def mk_prompt(doc) -> list:
    prompt = "Write Python code to solve competitive programming problems in a markdown code block."

    starter_code = None if len(doc["starter_code"]) == 0 else doc["starter_code"]
    try:
        input_output = json.loads(doc["input_output"])
        fn_name = None if not input_output.get("fn_name") else input_output["fn_name"]
    except ValueError:
        fn_name = None

    prompt += "\nQUESTION:\n"
    prompt += doc["question"]
    if starter_code:
        prompt += starter_code
    if not fn_name:
        prompt += "\nUse Standard Input format"
    else:
        prompt += "\nUse Call-Based format"
    prompt += "\nPlease generate the code in a ```python markdown block, ensuring to include the closing ``` at the end."

    conversation = [{"role": "user", "content": prompt}]
    return conversation
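
# A minimal sketch of what mk_prompt produces, on a fabricated APPS-like record
# (fields and values below are made up for illustration):
#
#   fake_doc = {
#       "question": "Read an integer n and print n * 2.",
#       "starter_code": "",
#       "input_output": json.dumps({"inputs": ["3\n"], "outputs": ["6\n"]}),
#   }
#   mk_prompt(fake_doc)
#   # -> [{"role": "user", "content": "Write Python code to solve ... Use Standard Input format ..."}]
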

def mk_sample_prompt(model_path, apps_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token

    prompts = []
    for split in ["train", "test"]:
        ds = load_dataset(apps_path, split=split)
        for sample in ds:
            problem_id = split + "_" + str(sample["problem_id"])

            # Filter problems without input_output
            try:
                json.loads(sample["input_output"])
            except ValueError:
                print(f"Skipping {problem_id}: Invalid JSON in input_output")
                continue

            prompt = mk_prompt(sample)

            # Filter long prompts
            chat_text = tokenizer.apply_chat_template(prompt, tokenize=False)
            tokenized_prompt = tokenizer.encode(chat_text)
            if len(tokenized_prompt) > (4096 - 512):
                print(
                    f"Skipping {problem_id}: Token length {len(tokenized_prompt)} exceeds limit"
                )
                continue

            prompts.append(dict(problem_id=problem_id, messages=prompt))
    return prompts


def mk_key_for_sort(item):
    problem_id = item['problem_id']
    prefix, idx = problem_id.split('_')
    prefix_weight = 0 if prefix == 'train' else 1
    return (prefix_weight, int(idx))


def sort_and_drop_key(dataset, key):
    dataset = sorted(dataset, key=lambda x: x[key])
    for item in dataset:
        item.pop(key)
    return dataset


TEST_RANGES = [(0, 300), (3000, 3100), (4000, 4100)]


def is_in_test_range(prefix_weight, idx):
    if prefix_weight == 1:
        for start, end in TEST_RANGES:
            if start <= idx < end:
                return True
    return False
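
# Illustration of the helpers above (hypothetical problem_ids): train items sort
# before test items, then by numeric index, and only a few fixed index ranges of
# the test split are routed to the held-out test set.
#
#   mk_key_for_sort({"problem_id": "train_12"})    # -> (0, 12)
#   mk_key_for_sort({"problem_id": "test_3001"})   # -> (1, 3001)
#   is_in_test_range(1, 3001)                      # -> True  (3000 <= 3001 < 3100)
#   is_in_test_range(0, 3001)                      # -> False (train items never land in test)
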

def sort_and_split_dataset(dataset, n):
    """
    The dataset is divided into two parts: train and test.
    From the test set, about 10% of items across varying difficulties are selected.
    Among these, only problems for which the LLM generated at least one correct
    solution are kept in the minimal test set. This reduces the test time by
    approximately 1/5.
    """
    # add `key_for_sort`
    new_train, new_test = [], []
    for item in dataset:
        item["key_for_sort"] = mk_key_for_sort(item)
        if is_in_test_range(*item["key_for_sort"]):
            new_test.append(item)
        else:
            new_train.append(item)

    new_train = sort_and_drop_key(new_train, "key_for_sort")
    new_test = sort_and_drop_key(new_test, "key_for_sort")

    minimal_test = []
    assert len(new_test) % n == 0
    for i in range(len(new_test) // n):
        problem = new_test[i * n : (i + 1) * n]
        has_correct_solution = any(d["eval_result"] for d in problem)
        if has_correct_solution:
            minimal_test.extend(problem)

    return new_train, new_test, minimal_test
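

# A minimal, self-contained sketch of the split behaviour (fabricated records,
# n=2): the second test problem has no passing sample, so it is dropped from the
# minimal test set while remaining in the full test split.
if __name__ == "__main__":
    _demo = [
        {"problem_id": "test_0", "eval_result": True},
        {"problem_id": "test_0", "eval_result": False},
        {"problem_id": "test_1", "eval_result": False},
        {"problem_id": "test_1", "eval_result": False},
    ]
    _train, _test, _min_test = sort_and_split_dataset(_demo, 2)
    assert len(_test) == 4 and len(_min_test) == 2
    assert all(d["problem_id"] == "test_0" for d in _min_test)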