Commit fd1582c3 by nzy

test old models

parent 91cf4380
...@@ -6,6 +6,7 @@ from transformers import AutoTokenizer ...@@ -6,6 +6,7 @@ from transformers import AutoTokenizer
from vllm import SamplingParams from vllm import SamplingParams
from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.dataset.legacy_genrm_prompt import COV_MESSAGE
from codecritic.utils.inference import generate_worker, score_worker from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl from codecritic.utils.json import load_jsonl, save_jsonl
...@@ -35,7 +36,7 @@ if __name__ == "__main__": ...@@ -35,7 +36,7 @@ if __name__ == "__main__":
if args.reasoning: if args.reasoning:
for item in dataset: for item in dataset:
item["messages"].append(THINK_MESSAGE) item["messages"].append(COV_MESSAGE)
sampling_params = SamplingParams( sampling_params = SamplingParams(
n=1, n=1,
......
...@@ -5,10 +5,62 @@ JUDGE_TOEKNS = { ...@@ -5,10 +5,62 @@ JUDGE_TOEKNS = {
"negative": "No" "negative": "No"
} }
def mk_judge_response(response):
    """Build the assistant judgement turn for a "positive"/"negative" verdict."""
    return {"role": "assistant", "content": JUDGE_TOEKNS[response]}
THINK_PROMPT = ""
THINK_MESSAGE = {"role": "user", "content": THINK_PROMPT} SYS_PROMPT = """
**Role:** You are an experienced code reviewer. Your task is to carefully analyze the provided code, verify its correctness, and provide constructive feedback in a conversational and human-like tone. If you find any errors, inefficiencies, or areas for improvement, point them out clearly and suggest fixes.
**Instructions:**
1. Read and understand the question and the provided code.
2. Analyze the code step by step, ensuring each part functions as intended.
3. If you find any issues (e.g., logical errors, syntax errors, inefficiencies, or edge cases not handled), explain the problem in a conversational tone and suggest a solution.
4. If the code is correct, confirm its correctness and explain why it works as expected.
5. Keep the tone natural and avoid overly technical jargon unless necessary.
**Format:**
- Start by briefly acknowledging the code and its purpose.
- For each step or part of the code, write your observations and feedback in a conversational way, as if you were explaining it to a colleague.
- Use phrases like “I noticed that…”, “This part looks good because…”, “One thing to consider is…”, or “Here’s a suggestion to improve…”.
**Example Input:**
**Question:**
Write a Python function to calculate the factorial of a number.
**Answer:**
```python
def factorial(n):
if n == 0:
return 1
else:
return n * factorial(n - 1)
```
**Example Output:**
Alright, let’s take a look at this factorial function. Overall, it’s a solid implementation, but there are a few things worth discussing.
First, I noticed that the base case is handled correctly—when `n == 0`, it returns `1`, which is exactly what we want for a factorial calculation. Good job there!
Next, the recursive part of the function looks good too. It correctly calls itself with `n - 1` and multiplies the result by `n`. This will work perfectly for non-negative integers.
However, I did notice one potential issue: the function doesn’t handle negative numbers. If someone passes a negative value for `n`, it will keep recursing indefinitely, which could lead to a stack overflow. To make this more robust, I’d suggest adding a check at the beginning of the function to handle negative inputs. For example, you could raise a `ValueError` with a message like ‘Factorial is not defined for negative numbers.’
Other than that, this is a clean and correct implementation. Nice work!
""".strip()
# Template for the user turn: the task statement followed by the candidate answer.
USER_INPUT = """
**Question:**
{question}
**Answer:**
{answer}
""".strip()

# System turn carrying the code-reviewer persona prompt (SYS_PROMPT).
SYSTEM_MESSAGE = {"role": "system", "content": SYS_PROMPT}
def mk_user_input(question, answer):
    """Build the user message pairing a question with a candidate answer."""
    body = USER_INPUT.format(question=question, answer=answer)
    return {"role": "user", "content": body}
from codecritic.dataset.code import extract_code, code_template
from codecritic.data.utils import SPLITTER, mk_message
from codecritic.dataset.genrm_prompt import mk_judge_response
# Instruction asking the model to verify its code block-by-block.
COV_PROMPT = "Please verify your code step by step using Markdown code blocks. After each step, explain whether it's correct or not, and if not, explain the issue."

# Few-shot example demonstrating the expected verification format.
# NOTE(review): in-string indentation may have been lost in extraction — confirm
# against the original file before relying on exact formatting.
COV_EXAMPLE = """\
** Example RETURN FORMAT **
```python
def add_numbers(a, b):
return a + b
result = add_numbers(5, '10')
```
1. **Code:**
```python
def add_numbers(a, b):
return a + b
```
This defines a function `add_numbers` that takes two arguments and returns their sum. Correct.
2. **Code:**
```python
result = add_numbers(5, '10')
```
The second argument is a string (`'10'`), which will cause a TypeError when trying to add it to an integer. Incorrect.
"""

# Anchor sentences that reveal the ground-truth label (used in train mode only).
CORRECT_PROMPT = "Your code is correct."
INCORRECT_PROMPT = "Your code is incorrect."

# Bare verification request: prompt only, no anchor and no example.
COV_MESSAGE = {"role": "user", "content": COV_PROMPT}
def mk_cov_prompt(is_correct, splitter, mode):
    """Build the chain-of-verification prompt turns.

    Args:
        is_correct: whether the code under review is known to be correct;
            only used to pick the anchor sentence in "train" mode.
        splitter: truthy -> also emit a pre-seeded assistant turn ending in
            SPLITTER so generation resumes after the fixed preamble.
        mode: "train" (anchor reveals the label) or "test" (no anchor).

    Returns:
        A list of one or two message dicts.

    Raises:
        ValueError: if mode is neither "train" nor "test".
    """
    if mode == "train":
        anchor = CORRECT_PROMPT if is_correct else INCORRECT_PROMPT
    elif mode == "test":
        # No label leakage at test time: no anchor sentence.
        anchor = ""
    else:
        raise ValueError(f"Invalid mode: {mode}. Expected 'train' or 'test'.")

    # Skip empty parts so test mode does not start with a stray leading newline
    # (joining ["", COV_PROMPT, COV_EXAMPLE] would produce "\nPlease verify...").
    content = '\n'.join(part for part in [anchor, COV_PROMPT, COV_EXAMPLE] if part)
    turn1 = {"role": "user", "content": content}

    if splitter:
        # Pre-seed the assistant reply up to SPLITTER so the model continues
        # the verification after this fixed preamble.
        turn2 = {
            "role": "assistant",
            "content": "Here's a step-by-step verification of the code." + SPLITTER,
        }
        return [turn1, turn2]
    else:
        return [turn1]
def convert_preference_to_vot_prompt(item, splitter, mode):
    """Turn one preference pair into two verification samples (chosen, rejected)."""
    question = item["messages"][0]["content"]
    pid = item["problem_id"]

    def build_sample(raw_response, label):
        # Keep only the code from the response, wrapped in the shared template.
        snippet = code_template.format(extract_code(raw_response))
        msgs = mk_message(question, snippet) + mk_cov_prompt(label, splitter, mode)
        return {"messages": msgs, "eval_result": label, "problem_id": pid}

    return (
        build_sample(item["chosen"]["content"], True),
        build_sample(item["rejected"]["content"], False),
    )
def convert_sft_to_vot_prompt(item, splitter, mode):
    """Turn one SFT sample into a single verification sample."""
    question, response = (m["content"] for m in item["messages"][:2])
    snippet = code_template.format(extract_code(response))
    label = item["eval_result"]
    return {
        "messages": mk_message(question, snippet) + mk_cov_prompt(label, splitter, mode),
        "eval_result": label,
        "problem_id": item["problem_id"],
    }
def convert_cov_to_cov_dataset(item, mode):
    # Normalize the third turn (the verification request) to the bare COV prompt.
    item["messages"][2]["content"] = COV_PROMPT
    if mode == "train":
        # NOTE(review): `mk_critic_verify` is neither defined nor imported in
        # this file (the imports bring in `mk_judge_response` instead), so this
        # branch raises NameError at runtime in "train" mode. Presumably it
        # should be imported from the critic/verify helper module — confirm and
        # add the import.
        item["messages"] += mk_critic_verify(item["eval_result"])
    return item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment