Commit a4c9cab3 by nzy

rename data to dataset & prepare apps refactor

parent 6d68e66f
# Additional Experiment:
# As we know, there are two primary methods for training a reward model:
# 1. Using reward loss
# 2. Using SFT (Supervised Fine-Tuning) directly
# This experiment aims to fairly compare these two approaches.
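# For context, a rough sketch of the two objectives being compared (illustrative
# only; it is not used by this script, and the helper names below are hypothetical):
import math

def pairwise_reward_loss(r_chosen, r_rejected):
    # 1. Reward loss: Bradley-Terry pairwise objective on scalar reward-model
    #    outputs, -log sigmoid(r_chosen - r_rejected).
    return -math.log(1.0 / (1.0 + math.exp(-(r_chosen - r_rejected))))

def sft_verify_loss(p_correct_label):
    # 2. SFT: plain next-token cross-entropy on the verify answer appended via
    #    mk_critic_verify(True/False), i.e. -log p(label tokens | context).
    return -math.log(p_correct_label)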
import argparse
from codecritic.data.utils import mk_message, save_jsonl_dataset
from codecritic.dataset.utils import mk_message, save_jsonl_dataset
from codecritic.utils.json import load_jsonl
from codecritic.data.verify import mk_critic_verify
from codecritic.dataset.verify import mk_critic_verify
def mk_sft_dataset(messages):
......@@ -20,8 +15,6 @@ def convert_preference_to_sft(item):
    chosen = item["chosen"]
    rejected = item["rejected"]
    messages1 = message + chosen + mk_critic_verify(True)
    messages2 = message + rejected + mk_critic_verify(False)
    return mk_sft_dataset(messages1), mk_sft_dataset(messages2)
......
from codecritic.evaluation.metric import group_results, score_pass_at_k
def eval():
    # compute pass@k
    eval_result_path = result_dir / "passk.jsonl"
    # results = load_jsonl(score_path)
    groups = group_results(results, apps_path)
    eval_results = [score_pass_at_k(groups, k, home_path.stem) for k in range(1, 16)]
    save_jsonl(eval_results, eval_result_path)
    pprint.pp(eval_results)
\ No newline at end of file
import argparse
from itertools import chain
from codecritic.data.cov import (
from codecritic.dataset.cov import (
    convert_preference_to_vot_prompt,
    convert_cov_to_cov_dataset,
)
from codecritic.utils.json import load_json
from codecritic.data.utils import save_jsonl_dataset
from codecritic.dataset.utils import save_jsonl_dataset
from codecritic.utils.vllm import vllm_chatcomplete
......
......@@ -8,7 +8,7 @@ from threading import Lock
from openai import OpenAI
import json
from codecritic.data.cov import (
from codecritic.dataset.cov import (
    convert_preference_to_vot_prompt,
    convert_sft_to_vot_prompt,
    convert_cov_to_cov_dataset,
......
import argparse
from pathlib import Path
from codecritic.utils.json import load_json
from codecritic.data.utils import save_jsonl_dataset
from codecritic.dataset.utils import save_jsonl_dataset
from codecritic.data.edit_distance import (
from codecritic.dataset.edit_distance import (
    mk_problem_groups,
    calculate_edit_distances,
    mk_edit_distance_dataset,
......
import argparse
from pathlib import Path
import random
from codecritic.utils.json import load_jsonl, save_jsonl
def add_mask_and_score(messages):
    for idx, turn in enumerate(messages):
        if idx != 3:
            turn["mask"] = True
        else:
            turn["mask"] = False
        if idx == 5:
            turn["score"] = True
    return messages
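# Illustration on a hypothetical 6-turn conversation: every turn gets
# {"mask": True} except index 3, which gets {"mask": False}; index 5
# additionally gets {"score": True}, e.g.
#   add_mask_and_score([{"content": str(i)} for i in range(6)])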
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--path", type=str)
    parser.add_argument("--split", action="store_true")
    args = parser.parse_args()

    dataset = load_jsonl(args.path)
    for item in dataset:
        item["messages"] = add_mask_and_score(item["messages"])

    if args.split:
        random.shuffle(dataset)
        split_len = int(len(dataset) * 0.01)
        test = dataset[:split_len]
        train = dataset[split_len:]
        dataset_path = Path(args.path).parent
        save_jsonl(train, dataset_path / "train.jsonl")
        save_jsonl(test, dataset_path / "test.jsonl")
    else:
        save_jsonl(dataset, args.path)
import argparse
from itertools import chain
from codecritic.data.cov_with_diff import (
    transform_preference_to_qwq_prompt,
    transform_qwqout_to_trainset
)
from codecritic.utils.json import load_jsonl, save_jsonl
from codecritic.utils.vllm import vllm_chatcomplete
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model", type=str)
    parser.add_argument("--preference_dataset", type=str)
    parser.add_argument("--out", type=str)
    args = parser.parse_args()

    preference_dataset = load_jsonl(args.preference_dataset)
    cov_prompts = [transform_preference_to_qwq_prompt(x) for x in preference_dataset]
    cov_prompts = list(chain(*cov_prompts))

    sampling_params = dict(n=1, temperature=0, max_tokens=6144)
    covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params, 2)
    # save_jsonl(covs, args.out + ".raw")
    dataset = list(map(transform_qwqout_to_trainset, covs))
    save_jsonl(dataset, args.out)
......@@ -5,7 +5,7 @@ import os
from transformers import AutoTokenizer
from vllm import SamplingParams
from codecritic.data.genrm_prompt import THINK_PROMPT, JUDGE_PROMPT, JUDGE_TOEKNS
from codecritic.dataset.genrm_prompt import THINK_PROMPT, JUDGE_PROMPT, JUDGE_TOEKNS
from codecritic.evaluation.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl
......
import json
def mk_prompt(doc):
prompt = "Write Python code to solve competitive programming problems in a markdown code block."
starter_code = None if len(doc["starter_code"]) == 0 else doc["starter_code"]
try:
input_outpout = json.loads(doc["input_output"])
fn_name = None if not input_outpout.get("fn_name") else input_outpout["fn_name"]
except ValueError:
fn_name = None
prompt += "\nQUESTION:\n"
prompt += doc["question"]
if starter_code:
prompt += starter_code
if not fn_name:
prompt += "\nUse Standard Input format"
else:
prompt += "\nUse Call-Based format"
prompt += "\nPlease generate the code in a ```python markdown block, ensuring to include the closing ``` at the end."
conversation = [{"role": "user", "content": prompt}]
return conversation
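# Minimal usage sketch with a made-up APPS-style record (field values are
# hypothetical; real records carry more metadata):
if __name__ == "__main__":
    demo_doc = {
        "question": "Given two integers a and b, print a + b.",
        "starter_code": "",
        "input_output": json.dumps({"inputs": ["1 2\n"], "outputs": ["3\n"]}),
    }
    # No fn_name in input_output, so the prompt asks for Standard Input format.
    print(mk_prompt(demo_doc)[0]["content"])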
# Additional Experiment:
# Does reasoning really work? Let's verify step by step.
from codecritic.data.code import extract_code, code_template
from codecritic.dataset.code import extract_code, code_template
from codecritic.data.utils import SPLITTER, mk_message
from codecritic.data.verify import mk_critic_verify
from codecritic.dataset.utils import SPLITTER, mk_message
from codecritic.dataset.verify import mk_critic_verify
COV_PROMPT = "Please verify your code step by step using Markdown code blocks. After each step, explain whether it's correct or not, and if not, explain the issue."
......
from difflib import unified_diff
import re
from codecritic.data.code import extract_code
from codecritic.data.verify import mk_critic_verify
from codecritic.dataset.code import extract_code
from codecritic.dataset.verify import mk_critic_verify
# QwQ doesn't follow my instructions, but it outputs *really* reasonable explanations
......
from codecritic.utils.json import load_jsonl
from codecritic.data.code import extract_code, code_template
from codecritic.dataset.code import extract_code, code_template
from nltk.metrics.distance import edit_distance
from collections import defaultdict
from itertools import product, chain
......
# copy from codeparrot/apps_metric/utils.py
# https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/utils.py
import json
import multiprocessing
import numpy as np
from tqdm.contrib.concurrent import process_map
from codecritic.evaluation.apps_exec import run_test
from codecritic.utils.json import save_jsonl
from codecritic.dataset.code import extract_code
TIMEOUT = 10
def check_correctness(sample, generation, timeout, debug=False):
    """Check correctness of code generation with a global timeout.
    The global timeout is to catch some extreme/rare cases not handled by the timeouts
    inside `run_test`."""

    def _temp_run(sample, generation, debug, result):
        result.append(run_test(sample, test=generation, debug=debug))

    manager = multiprocessing.Manager()
    result = manager.list()
    p = multiprocessing.Process(
        target=_temp_run, args=(sample, generation, debug, result)
    )
    p.start()
    p.join(timeout=timeout + 1)
    if p.is_alive():
        p.kill()
    if not result:
        in_outs = json.loads(sample["input_output"])
        # consider that all tests failed
        result = [[-1 for i in range(len(in_outs["inputs"]))]]
        if debug:
            print("global timeout")
    return result[0]
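# A hypothetical smoke test of the global timeout: a generation that never
# terminates is killed by the watchdog and every test is reported as failed (-1).
# The sample below is a toy stand-in; real APPS records carry more fields.
if __name__ == "__main__":
    demo_sample = {"input_output": json.dumps({"inputs": ["1\n"], "outputs": ["1\n"]})}
    print(check_correctness(demo_sample, "while True:\n    pass", timeout=1, debug=True))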
def test_generation(args, debug=False):
    apps_item, sample = args
    code = extract_code(sample["response"][0]["content"])

    curr_res = [-2]
    try:
        curr_res = check_correctness(apps_item, code, timeout=TIMEOUT, debug=debug)
        if debug:
            print(f"\nSuccessful compilation of task {code}!")
        fixed = []
        for e in curr_res:
            if isinstance(e, np.ndarray):
                e = e.item(0)
            if isinstance(e, np.bool_):
                e = bool(e)
            fixed.append(e)
        curr_res = fixed
        if not np.all(curr_res):
            if debug:
                print(curr_res)
                print("Results were not True for all test cases")
    except Exception as e:
        if debug:
            print(f"Compilation failed, test framework exception = {repr(e)}{e}\n")
    finally:
        assert isinstance(curr_res, list)

    problem_result = np.asarray(curr_res)
    return {
        **sample,
        "code": code,
        "eval_result": bool(np.all(problem_result > 0)),
        "testcase": curr_res,
    }
def evaluate_code_samples(code_samples, apps):
    args = []
    for sample in code_samples:
        problem_id = sample["problem_id"]
        args.append((apps["test"][int(problem_id)], sample))

    cpu_num = multiprocessing.cpu_count()
    chunksize = max(len(code_samples) // (cpu_num * 5), 1)
    # TODO performance?
    results = process_map(
        test_generation, args, max_workers=cpu_num, chunksize=chunksize
    )
    return results
def evaluate_incorrect_code_samples_again(results, apps, loop_num):
    """
    There are some strange, hard-to-reproduce bugs in APPS evaluation.
    The observable symptom is that the same code can yield different 'eval_result' values:
    typically the test framework hits an exception or incorrectly decides the code has timed out.
    This function is an ugly workaround for that problem:
    re-run the suspicious samples and keep only results that are consistent across runs.
    The 'loop_num' parameter bounds how many times evaluation is retried to obtain a consistent result.
    """
    maybe_incorrect_lst, correct_lst = [], []
    for item in results:
        if any(x in item["testcase"] for x in (-1, -2)):
            maybe_incorrect_lst.append(item)
        else:
            correct_lst.append(item)

    for _ in range(loop_num):
        if len(maybe_incorrect_lst) == 0:
            break
        new_results = evaluate_code_samples(maybe_incorrect_lst, apps)
        print(f"maybe incorrect lst size: {len(maybe_incorrect_lst)}")
        check_lst = []
        for i in range(len(new_results)):
            old_item, new_item = maybe_incorrect_lst[i], new_results[i]
            old_eval, new_eval = old_item["eval_result"], new_item["eval_result"]
            if old_eval == new_eval:
                correct_lst.append(old_item)
            else:
                check_lst.append(new_item)
                print(old_item["problem_id"], old_eval, new_item["problem_id"], new_eval)
        maybe_incorrect_lst = check_lst

    if len(results) != len(correct_lst):
        save_jsonl(maybe_incorrect_lst, "debug.jsonl")
        # raise ValueError("cannot correctly evaluate codes")
        print("cannot correctly evaluate code. see debug.jsonl")
        if len(maybe_incorrect_lst) < 5:
            correct_lst.extend(maybe_incorrect_lst)
    return correct_lst


def evaluate(code_samples, apps):
    results = evaluate_code_samples(code_samples, apps)
    results = evaluate_incorrect_code_samples_again(results, apps, 10)
    return results
import numpy as np
from datasets import load_dataset
from collections import defaultdict
def estimate_pass_at_k(
    num_samples: list[int], num_correct: list[int], k: int
) -> np.ndarray:
    """
    Estimates pass@k of each problem and returns them in an array.
    """

    def estimator(n: int, c: int, k: int) -> float:
        """
        Calculates 1 - comb(n - c, k) / comb(n, k).
        """
        if n - c < k:
            return 1.0
        return 1.0 - np.prod(1.0 - k / np.arange(n - c + 1, n + 1))

    return np.array(
        [estimator(int(n), int(c), k) for n, c in zip(num_samples, num_correct)]
    )
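# Worked example (toy numbers): with n = 5 samples and c = 2 correct,
#   pass@1 = 1 - C(3,1)/C(5,1) = 0.4 and pass@3 = 1 - C(3,3)/C(5,3) = 0.9,
# so estimate_pass_at_k([5, 5], [2, 2], 1) -> array([0.4, 0.4])
#    estimate_pass_at_k([5, 5], [2, 2], 3) -> array([0.9, 0.9])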
def group_results(results, apps_path):
    """
    Output
    {
        "interview": {
            problem_id: [
                {"problem_id": problem_id, "eval_result": True, ...},
                ...
            ],
            ...
        },
        ...
    }
    """
    dataset = load_dataset(apps_path)

    groups = defaultdict(lambda: defaultdict(list))
    for item in results:
        problem_id = item["problem_id"]
        split, idx = problem_id.split("_")
        difficulty = dataset[split][int(idx)]["difficulty"]
        groups[difficulty][problem_id].append(item)

    if "score" in results[0]:
        for difficulty, problem in groups.items():
            for problem_id, lst in problem.items():
                sorted_lst = sorted(lst, key=lambda x: x["score"], reverse=True)
                problem[problem_id] = sorted_lst
    return groups
def pass_at_k(groups, k):
    result = {"strategy": "pass@k", "k": k}
    for difficulty, problems in groups.items():
        num_samples, num_correct = [], []
        for lst in problems.values():
            num_samples.append(len(lst))
            num_correct.append(sum(item["eval_result"] for item in lst))
        pass_at_k = np.mean(estimate_pass_at_k(num_samples, num_correct, k))
        result[difficulty] = pass_at_k
    return result
def score_pass_at_k(groups, k, strategy):
    result = {"strategy": strategy, "k": k}
    for difficulty, problems in groups.items():
        num_samples, num_correct = 0, 0
        for lst in problems.values():
            num_samples += 1
            num_correct += any(item["eval_result"] for item in lst[:k])
        pass_at_k = num_correct / num_samples
        result[difficulty] = pass_at_k
    return result
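# A minimal sketch contrasting the two metrics on synthetic data (the items,
# scores, and difficulty label are made up). Each problem's list is assumed to
# be sorted by score, highest first, as group_results does when scores exist.
if __name__ == "__main__":
    toy_groups = {
        "interview": {
            "test_0": [
                {"eval_result": False, "score": 0.9},
                {"eval_result": True, "score": 0.2},
            ]
        }
    }
    print(pass_at_k(toy_groups, 1))                    # unbiased pass@1 over 2 samples: 0.5
    print(score_pass_at_k(toy_groups, 1, "verifier"))  # top-1 by score is wrong: 0.0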
......@@ -4,6 +4,8 @@ format: typst
bibliography: refs.bib
---
TODO Rewrite Readme
## Abstract
LLM-based code verifiers are essential for improving the accuracy of large language models (LLMs) in code generation at test time by filtering out incorrect solutions. While humans can pinpoint bugs and judge correctness by reasoning through the code, current code verifiers can only make end-to-end binary judgments, which limits their ability to verify code properly.
......