Commit 251bf975 by nzy

step1: sample code

parents
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
readme.pdf
\ No newline at end of file
model = "/path/to/model"
apps = "/path/to/apps_dataset"
[sample]
sample_prompt_path = "path"
sample_result_path = "path"
[sample.sampling_params]
n = 0
temperature = 0.0
max_new_tokens = 512
[evaluate]
evaluate_result_path = ""
\ No newline at end of file
---
title: "Code Critic"
format: typst
---
## Abstract
LLM-based code verifiers are essential for improving the accuracy of large language models (LLMs) in code generation at test time by filtering out incorrect solutions. While humans can pinpoint bugs and judge the correctness by reasoning through the code, current code verifiers can only make end-to-end binary judgments, which limits their capability to properly verify the code.
In this paper, we present COVER, a novel approach that improves the code verifiers’ ability by integrating them with error localization and reasoning. During training, COVER leverages LLMs to generate diverse code samples and assess their correctness through test cases. It then automatically produces detailed error annotations by comparing incorrect code samples with their most similar correct versions. This comparison enables the LLMs to generate explanatory annotations that highlight both the location and the nature of the errors. During inference, the verifier—trained on this enriched dataset—evaluates multiple candidate solutions, not only determining their correctness but also reasoning about potential bugs, ultimately selecting the most accurate solution.
Our experimental results demonstrate that COVER significantly outperforms existing methods. It achieves a test accuracy improvement of X% on [specific benchmark name] and Y% on [another benchmark name], representing a substantial advancement in code verification for LLMs.
## Methods
### Step1 Sample & Evaluate
### Step2 Prepare preference code pairs
## Environment
Same as LLaMA-Factory (the "Recommend" column lists the recommended versions):
| Mandatory | Minimum | Recommend |
| ------------ | ------- | --------- |
| python | 3.8 | 3.11 |
| torch | 1.13.1 | 2.4.0 |
| transformers | 4.41.2 | 4.43.4 |
| datasets | 2.16.0 | 2.20.0 |
| accelerate | 0.30.1 | 0.32.0 |
| peft | 0.11.1 | 0.12.0 |
| trl | 0.8.6 | 0.9.6 |
| Optional | Minimum | Recommend |
| ------------ | ------- | --------- |
| CUDA | 11.6 | 12.2 |
| deepspeed | 0.10.0 | 0.14.0 |
| vllm | 0.4.3 | 0.5.0 |
In addition, install ***[Difftastic](https://github.com/Wilfred/difftastic)***:
```bash
$ cargo install --locked difftastic
```
\ No newline at end of file
# copy from codeparrot/apps_metric/utils.py
# https://huggingface.co/spaces/codeparrot/apps_metric/blob/main/utils.py
import json
import multiprocessing
import numpy as np
from typing import Dict
from datasets import load_dataset
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map
from step1_apps_test import run_test
from utils import extract_code, read_config, load_jsonl, save_jsonl
TIMEOUT = 10
def check_correctness(sample, generation, timeout, debug=False):
    """Check correctness of code generation with a global timeout.

    The global timeout is to catch some extreme/rare cases not handled by the
    timeouts inside `run_test`.

    Args:
        sample: Problem record. Its ``"input_output"`` field must be a JSON
            string with an ``"inputs"`` list; it is only parsed when the child
            process produced no result.
        generation: Candidate source code, forwarded to ``run_test`` as ``test``.
        timeout: Budget in seconds; the child process gets ``timeout + 1``.
        debug: Forwarded to ``run_test``; also prints a note on global timeout.

    Returns:
        The result that ``run_test`` appended for this generation, or a list of
        ``-1`` (one entry per input) when nothing came back in time.
    """

    def _temp_run(sample, generation, debug, result):
        result.append(run_test(sample, test=generation, debug=debug))

    # Use the manager as a context manager so its server process is shut down
    # deterministically here rather than whenever the object gets collected.
    with multiprocessing.Manager() as manager:
        shared = manager.list()
        proc = multiprocessing.Process(
            target=_temp_run, args=(sample, generation, debug, shared)
        )
        proc.start()
        proc.join(timeout=timeout + 1)
        if proc.is_alive():
            proc.kill()
            # Reap the killed child right away instead of leaving it pending.
            proc.join()
        # Copy out of the proxy while the manager is still running.
        result = list(shared)

    if not result:
        in_outs = json.loads(sample["input_output"])
        # consider that all tests failed
        result = [[-1] * len(in_outs["inputs"])]
        if debug:
            print("global timeout")
    return result[0]
def test_generation(args, debug=False):
    """Evaluate one generated solution against its problem's test cases.

    Args:
        args: ``(apps_item, code_sample)`` pair. The last entry of
            ``code_sample["messages"]`` must be the assistant reply that
            contains the code.
        debug: Print diagnostics when true.

    Returns:
        ``code_sample`` with ``"eval_result"`` set to a plain ``bool`` that is
        true only when every per-test result is strictly positive.
    """
    apps_item, code_sample = args
    message = code_sample["messages"][-1]
    assert message["role"] == "assistant"
    code = extract_code(message["content"])

    # Placeholder kept when check_correctness raises; being negative it makes
    # the sample count as failed.
    curr_res = [-2]
    try:
        curr_res = check_correctness(apps_item, code, timeout=TIMEOUT, debug=debug)
        if debug:
            print(f"\nSuccessful compilation of task {code}!")
        # Unwrap numpy containers/scalars into plain Python values.
        fixed = []
        for e in curr_res:
            if isinstance(e, np.ndarray):
                e = e.item(0)
            if isinstance(e, np.bool_):
                e = bool(e)
            fixed.append(e)
        curr_res = fixed
        if not np.all(curr_res):
            if debug:
                print(curr_res)
                print("Results were not True for all test cases")
    except Exception as e:
        if debug:
            print(f"Compilation failed, test framework exception = {repr(e)}{e}\n")
    finally:
        assert isinstance(curr_res, list)

    problem_results = np.asarray(curr_res)
    # np.all returns numpy.bool_, which json.dumps rejects; store a real bool so
    # the record can be written out as JSON later.
    code_sample["eval_result"] = bool(np.all(problem_results > 0))
    return code_sample
def evaluate_code_samples(code_samples: list, dataset_path: str):
    """Run ``test_generation`` over every code sample in parallel.

    Args:
        code_samples: Records carrying a ``"problem_id"`` of the form
            ``"<split>_<index>"``.
        dataset_path: Location handed to ``load_dataset``.

    Returns:
        Whatever ``process_map`` returns for the ``test_generation`` calls.
    """
    apps_eval = load_dataset(dataset_path)

    jobs = []
    for sample in code_samples:
        # get corresponding samples from APPS dataset
        split_name, row_index = sample["problem_id"].split('_')
        jobs.append((apps_eval[split_name][int(row_index)], sample))

    worker_count = multiprocessing.cpu_count()
    return process_map(test_generation, jobs, max_workers=worker_count)
def evaluate(code_sample_path, dataset_path, output_path):
    """Load code samples from JSONL, evaluate them, and write the results as JSONL."""
    save_jsonl(
        evaluate_code_samples(load_jsonl(code_sample_path), dataset_path),
        output_path,
    )
if __name__ == "__main__":
    # All paths come from the TOML file passed via --config.
    cfg = read_config()
    sampled_path = cfg["sample"]["sample_result_path"]
    apps_path = cfg["apps"]
    result_path = cfg["evaluate"]["evaluate_result_path"]
    evaluate(sampled_path, apps_path, result_path)
\ No newline at end of file
from datasets import load_dataset
import json
from utils import read_config, save_jsonl
from transformers import AutoTokenizer
def mk_prompt(doc) -> list[dict[str, str]]:
    """Build the single-turn chat conversation for one problem.

    Args:
        doc: Problem record with ``"question"``, ``"starter_code"`` and
            ``"input_output"`` fields. ``"input_output"`` is parsed as JSON to
            look for a ``"fn_name"`` entry.

    Returns:
        A one-element list holding the user message (``role``/``content``).
    """
    # A missing, unparsable or empty fn_name selects the standard-input format.
    try:
        fn_name = json.loads(doc["input_output"]).get("fn_name")
    except (ValueError, TypeError):
        fn_name = None

    parts = [
        "Write Python code to solve competitive programming problems in a markdown code block.",
        "\nQUESTION:\n",
        doc["question"],
    ]
    # Starter code is appended directly after the question, with no separator.
    if doc["starter_code"]:
        parts.append(doc["starter_code"])
    if fn_name:
        parts.append("\nUse Call-Based format")
    else:
        parts.append("\nUse Standard Input format")
    parts.append(
        "\nPlease generate the code in a ```python markdown block, ensuring to include the closing ``` at the end."
    )

    return [{"role": "user", "content": "".join(parts)}]
def mk_sample_prompt(model_path, apps_path, output_path, max_prompt_tokens=4096 - 512):
    """Build chat prompts for the train and test splits and save them as JSONL.

    Problems whose ``input_output`` is not valid JSON, or whose templated prompt
    is longer than ``max_prompt_tokens`` tokens, are skipped with a message.

    Args:
        model_path: Passed to ``AutoTokenizer.from_pretrained``; the tokenizer
            is used only to measure prompt length.
        apps_path: Dataset location passed to ``load_dataset``.
        output_path: Destination JSONL file.
        max_prompt_tokens: Longest prompt kept, in tokens. Defaults to
            ``4096 - 512``, the limit that was previously hard-coded.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    tokenizer.pad_token = tokenizer.eos_token

    prompts = []
    for split in ["train", "test"]:
        ds = load_dataset(apps_path, split=split)
        for sample in ds:
            # "<split>_<problem_id>": the evaluation step splits this id on "_".
            problem_id = split + "_" + str(sample["problem_id"])

            # Filter problems without input_output
            try:
                json.loads(sample["input_output"])
            except ValueError:
                print(f"Skipping {problem_id}: Invalid JSON in input_output")
                continue

            prompt = mk_prompt(sample)

            # Filter long prompts
            chat_text = tokenizer.apply_chat_template(prompt, tokenize=False)
            token_count = len(tokenizer.encode(chat_text))
            if token_count > max_prompt_tokens:
                print(f"Skipping {problem_id}: Token length {token_count} exceeds limit")
                continue

            prompts.append(dict(problem_id=problem_id, messages=prompt))

    print(f"size of dataset: {len(prompts)}")
    save_jsonl(prompts, output_path)
if __name__ == "__main__":
    # Model, dataset and output locations come from the --config TOML file.
    cfg = read_config()
    model_path = cfg["model"]
    apps_path = cfg["apps"]
    prompt_path = cfg["sample"]["sample_prompt_path"]
    mk_sample_prompt(model_path, apps_path, prompt_path)
\ No newline at end of file
from utils_vllm import vllm_inference
from utils import read_config
if __name__ == "__main__":
    # vllm_inference starts a multiprocessing.Pool. Under a spawn-style start
    # method the children re-import this module, so the work must sit behind
    # the __main__ guard to avoid being re-run in every child.
    cfg = read_config()
    vllm_inference(
        cfg["model"],
        cfg["sample"]["sample_prompt_path"],
        cfg["sample"]["sample_result_path"],
        cfg["sample"]["sampling_params"],
    )
\ No newline at end of file
from pathlib import Path
import json
import os
import re
from codebleu import calc_codebleu
import tomllib
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    Lines that are empty or contain only whitespace are skipped, so a stray
    blank line does not abort the load.

    Args:
        file_path: Path of a UTF-8 encoded file with one JSON value per line.

    Returns:
        The decoded values, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]
def load_json(file_path):
    """Return the value decoded from the UTF-8 JSON file at ``file_path``."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
def save_jsonl(data, file_path):
    """Write each item of ``data`` to ``file_path`` as one JSON document per line."""
    with open(file_path, mode="w", encoding="utf-8") as sink:
        # Lazily serialised, so records are written in order as they are encoded.
        sink.writelines(json.dumps(record) + "\n" for record in data)
# Matches the body of a fenced ```python ... ``` block; DOTALL lets it span lines.
codeblock_pattern = re.compile(r"```python(.+?)```", flags=re.DOTALL)


def extract_code(text: str):
    """Return the code found in all ```python fenced blocks of ``text``.

    Each block body is stripped of surrounding whitespace and the blocks are
    joined with a newline. An empty string is returned when no block is found.
    """
    snippets = [found.group(1).strip() for found in codeblock_pattern.finditer(text)]
    if not snippets:
        return ""
    return "\n".join(snippets)
def code_similarity(ref, pred):
    """Compare ``pred`` with ``ref`` via ``calc_codebleu`` for Python code.

    Returns whatever ``calc_codebleu`` returns for the single reference/prediction pair.
    """
    # Only the two middle weight components are non-zero.
    component_weights = (0, 0.5, 0.5, 0)
    references = [ref]
    predictions = [pred]
    return calc_codebleu(references, predictions, lang="python", weights=component_weights)
def read_config():
import argparse
argparser = argparse.ArgumentParser()
argparser.add_argument("--config", type=str)
args = argparser.parse_args()
with open(args.config, "rb") as f:
return tomllib.load(f)
\ No newline at end of file
from vllm import LLM, SamplingParams
import os
import multiprocessing
from functools import partial
from utils import load_jsonl, save_jsonl
def worker(cuda_device, prompts, model, sampling_params):
    """Generate completions for one shard of prompts on a single GPU.

    Runs inside a pool process, so results are returned rather than written
    into ``prompts``: in-place changes made here are not visible to the parent.

    Args:
        cuda_device: Device id string written to ``CUDA_VISIBLE_DEVICES`` before
            the model is loaded.
        prompts: Records with a ``"messages"`` chat history.
        model: Model location handed to ``LLM``.
        sampling_params: Mapping with ``n``, ``temperature`` and the generation
            length under ``max_tokens`` (or ``max_new_tokens``).

    Returns:
        One list per input prompt, in input order. Each inner list holds one
        copy of the record per sampled response, with that response appended
        as the final assistant message.

    Raises:
        KeyError: If neither ``max_tokens`` nor ``max_new_tokens`` is given.
    """
    os.environ["CUDA_VISIBLE_DEVICES"] = cuda_device
    llm = LLM(model=model, seed=42, max_model_len=8 * 1024)
    tokenizer = llm.get_tokenizer()
    # Drop ids the tokenizer reports as None so they are not passed to vLLM.
    candidate_stops = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
    stop_tokens = [token_id for token_id in candidate_stops if token_id is not None]
    print(f"SUCCESS: load llm {model} on cuda {cuda_device}")

    # Accept either key name for the generation length.
    if "max_tokens" in sampling_params:
        max_tokens = sampling_params["max_tokens"]
    elif "max_new_tokens" in sampling_params:
        max_tokens = sampling_params["max_new_tokens"]
    else:
        raise KeyError("sampling_params needs 'max_tokens' or 'max_new_tokens'")

    vllm_sampling_params = SamplingParams(
        n=sampling_params["n"],
        temperature=sampling_params["temperature"],
        top_p=0.95,
        max_tokens=max_tokens,
        stop_token_ids=stop_tokens,
    )

    text_prompts = [
        tokenizer.apply_chat_template(item["messages"], tokenize=False, add_generation_prompt=True)
        for item in prompts
    ]
    outputs = llm.generate(text_prompts, sampling_params=vllm_sampling_params, use_tqdm=False)

    completed = []
    for item, output in zip(prompts, outputs):
        # One record per sampled response, so every record ends with exactly
        # one assistant message.
        completed.append(
            [
                {
                    **item,
                    "messages": item["messages"]
                    + [{"role": "assistant", "content": response.text}],
                }
                for response in output.outputs
            ]
        )
    return completed


def vllm_inference(model_path, prompt_path, output_path, sampling_params):
    """Sample responses for every prompt across all visible GPUs and save them.

    Prompts are distributed round-robin over the devices listed in
    ``CUDA_VISIBLE_DEVICES``; one pool process per device runs ``worker``. The
    generated records are gathered back in the original prompt order and
    written to ``output_path`` as JSONL.

    Args:
        model_path: Model location forwarded to ``worker``.
        prompt_path: JSONL file of prompt records.
        output_path: Destination JSONL file.
        sampling_params: Sampling settings forwarded to ``worker``.

    Raises:
        KeyError: If ``CUDA_VISIBLE_DEVICES`` is not set.
    """
    prompts = load_jsonl(prompt_path)

    # Respect the slurm's gpu allocation
    cuda_devices = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
    gpu_num = len(cuda_devices)

    # split data: prompt i goes to shard i % gpu_num, at position i // gpu_num
    sub_prompts = [prompts[offset::gpu_num] for offset in range(gpu_num)]

    args = list(zip(cuda_devices, sub_prompts))
    # The keyword must match worker's parameter name (`model`).
    worker_llm = partial(worker, model=model_path, sampling_params=sampling_params)
    with multiprocessing.Pool(gpu_num) as pool:
        shard_outputs = pool.starmap(worker_llm, args)

    status = [len(done) == len(shard) for done, shard in zip(shard_outputs, sub_prompts)]
    print(f"Execution Status: {all(status)}")

    # Undo the round-robin split so the output follows the input order.
    results = []
    for index in range(len(prompts)):
        results.extend(shard_outputs[index % gpu_num][index // gpu_num])
    save_jsonl(results, output_path)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment