Commit fd1582c3 by nzy

test old models

parent 91cf4380
...@@ -6,6 +6,7 @@ from transformers import AutoTokenizer ...@@ -6,6 +6,7 @@ from transformers import AutoTokenizer
from vllm import SamplingParams from vllm import SamplingParams
from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS from codecritic.dataset.genrm_prompt import THINK_MESSAGE, JUDGE_MESSAGE, JUDGE_TOEKNS
from codecritic.dataset.legacy_genrm_prompt import COV_MESSAGE
from codecritic.utils.inference import generate_worker, score_worker from codecritic.utils.inference import generate_worker, score_worker
from codecritic.utils.parallel import model_map from codecritic.utils.parallel import model_map
from codecritic.utils.json import load_jsonl, save_jsonl from codecritic.utils.json import load_jsonl, save_jsonl
...@@ -35,7 +36,7 @@ if __name__ == "__main__": ...@@ -35,7 +36,7 @@ if __name__ == "__main__":
if args.reasoning: if args.reasoning:
for item in dataset: for item in dataset:
item["messages"].append(THINK_MESSAGE) item["messages"].append(COV_MESSAGE)
sampling_params = SamplingParams( sampling_params = SamplingParams(
n=1, n=1,
......
...@@ -5,10 +5,62 @@ JUDGE_TOEKNS = { ...@@ -5,10 +5,62 @@ JUDGE_TOEKNS = {
"negative": "No" "negative": "No"
} }
def mk_judge_response(response):
    """Build the assistant judgement turn for a "positive"/"negative" verdict."""
    return {"role": "assistant", "content": JUDGE_TOEKNS[response]}
THINK_PROMPT = ""
THINK_MESSAGE = {"role": "user", "content": THINK_PROMPT} SYS_PROMPT = """
**Role:** You are an experienced code reviewer. Your task is to carefully analyze the provided code, verify its correctness, and provide constructive feedback in a conversational and human-like tone. If you find any errors, inefficiencies, or areas for improvement, point them out clearly and suggest fixes.
**Instructions:**
1. Read and understand the question and the provided code.
2. Analyze the code step by step, ensuring each part functions as intended.
3. If you find any issues (e.g., logical errors, syntax errors, inefficiencies, or edge cases not handled), explain the problem in a conversational tone and suggest a solution.
4. If the code is correct, confirm its correctness and explain why it works as expected.
5. Keep the tone natural and avoid overly technical jargon unless necessary.
**Format:**
- Start by briefly acknowledging the code and its purpose.
- For each step or part of the code, write your observations and feedback in a conversational way, as if you were explaining it to a colleague.
- Use phrases like “I noticed that…”, “This part looks good because…”, “One thing to consider is…”, or “Here’s a suggestion to improve…”.
**Example Input:**
**Question:**
Write a Python function to calculate the factorial of a number.
**Answer:**
```python
def factorial(n):
if n == 0:
return 1
else:
return n * factorial(n - 1)
```
**Example Output:**
Alright, let’s take a look at this factorial function. Overall, it’s a solid implementation, but there are a few things worth discussing.
First, I noticed that the base case is handled correctly—when `n == 0`, it returns `1`, which is exactly what we want for a factorial calculation. Good job there!
Next, the recursive part of the function looks good too. It correctly calls itself with `n - 1` and multiplies the result by `n`. This will work perfectly for non-negative integers.
However, I did notice one potential issue: the function doesn’t handle negative numbers. If someone passes a negative value for `n`, it will keep recursing indefinitely, which could lead to a stack overflow. To make this more robust, I’d suggest adding a check at the beginning of the function to handle negative inputs. For example, you could raise a `ValueError` with a message like ‘Factorial is not defined for negative numbers.’
Other than that, this is a clean and correct implementation. Nice work!
""".strip()
# Template for the user turn: the task statement followed by the candidate answer.
USER_INPUT = """
**Question:**
{question}
**Answer:**
{answer}
""".strip()

# System turn carrying the code-reviewer persona prompt (SYS_PROMPT).
SYSTEM_MESSAGE = {"role": "system", "content": SYS_PROMPT}
def mk_user_input(question, answer):
    """Build the user message pairing a question with a candidate answer."""
    body = USER_INPUT.format(question=question, answer=answer)
    return {"role": "user", "content": body}
from codecritic.dataset.code import extract_code, code_template
from codecritic.data.utils import SPLITTER, mk_message
from codecritic.dataset.genrm_prompt import mk_judge_response
# Instruction asking the model to verify its code block-by-block.
COV_PROMPT = "Please verify your code step by step using Markdown code blocks. After each step, explain whether it's correct or not, and if not, explain the issue."

# Few-shot example demonstrating the expected verification format.
# NOTE(review): in-string indentation may have been lost in extraction — confirm
# against the original file before relying on exact formatting.
COV_EXAMPLE = """\
** Example RETURN FORMAT **
```python
def add_numbers(a, b):
return a + b
result = add_numbers(5, '10')
```
1. **Code:**
```python
def add_numbers(a, b):
return a + b
```
This defines a function `add_numbers` that takes two arguments and returns their sum. Correct.
2. **Code:**
```python
result = add_numbers(5, '10')
```
The second argument is a string (`'10'`), which will cause a TypeError when trying to add it to an integer. Incorrect.
"""

# Anchor sentences that reveal the ground-truth label (used in train mode only).
CORRECT_PROMPT = "Your code is correct."
INCORRECT_PROMPT = "Your code is incorrect."

# Bare verification request: prompt only, no anchor and no example.
COV_MESSAGE = {"role": "user", "content": COV_PROMPT}
def mk_cov_prompt(is_correct, splitter, mode):
    """Build the chain-of-verification prompt turns.

    Args:
        is_correct: whether the code under review is known to be correct;
            only used to pick the anchor sentence in "train" mode.
        splitter: truthy -> also emit a pre-seeded assistant turn ending in
            SPLITTER so generation resumes after the fixed preamble.
        mode: "train" (anchor reveals the label) or "test" (no anchor).

    Returns:
        A list of one or two message dicts.

    Raises:
        ValueError: if mode is neither "train" nor "test".
    """
    if mode == "train":
        anchor = CORRECT_PROMPT if is_correct else INCORRECT_PROMPT
    elif mode == "test":
        # No label leakage at test time: no anchor sentence.
        anchor = ""
    else:
        raise ValueError(f"Invalid mode: {mode}. Expected 'train' or 'test'.")

    # Skip empty parts so test mode does not start with a stray leading newline
    # (joining ["", COV_PROMPT, COV_EXAMPLE] would produce "\nPlease verify...").
    content = '\n'.join(part for part in [anchor, COV_PROMPT, COV_EXAMPLE] if part)
    turn1 = {"role": "user", "content": content}

    if splitter:
        # Pre-seed the assistant reply up to SPLITTER so the model continues
        # the verification after this fixed preamble.
        turn2 = {
            "role": "assistant",
            "content": "Here's a step-by-step verification of the code." + SPLITTER,
        }
        return [turn1, turn2]
    else:
        return [turn1]
def convert_preference_to_vot_prompt(item, splitter, mode):
    """Turn one preference pair into two verification samples (chosen, rejected)."""
    question = item["messages"][0]["content"]
    pid = item["problem_id"]

    def build_sample(raw_response, label):
        # Keep only the code from the response, wrapped in the shared template.
        snippet = code_template.format(extract_code(raw_response))
        msgs = mk_message(question, snippet) + mk_cov_prompt(label, splitter, mode)
        return {"messages": msgs, "eval_result": label, "problem_id": pid}

    return (
        build_sample(item["chosen"]["content"], True),
        build_sample(item["rejected"]["content"], False),
    )
def convert_sft_to_vot_prompt(item, splitter, mode):
    """Turn one SFT sample into a single verification sample."""
    question, response = (m["content"] for m in item["messages"][:2])
    snippet = code_template.format(extract_code(response))
    label = item["eval_result"]
    return {
        "messages": mk_message(question, snippet) + mk_cov_prompt(label, splitter, mode),
        "eval_result": label,
        "problem_id": item["problem_id"],
    }
def convert_cov_to_cov_dataset(item, mode):
    # Normalize the third turn (the verification request) to the bare COV prompt.
    item["messages"][2]["content"] = COV_PROMPT
    if mode == "train":
        # NOTE(review): `mk_critic_verify` is neither defined nor imported in
        # this file (the imports bring in `mk_judge_response` instead), so this
        # branch raises NameError at runtime in "train" mode. Presumably it
        # should be imported from the critic/verify helper module — confirm and
        # add the import.
        item["messages"] += mk_critic_verify(item["eval_result"])
    return item
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment