qwq

80ebb3b6 · nzy · 8bf1b27d · 80ebb3b6 · 80ebb3b6
Commit 80ebb3b6 authored Dec 17, 2024 by nzy
Hide whitespace changes
Inline Side-by-side

Showing with 142 additions and 0 deletions

codecritic/cli/run_qwq.py
+30 -0

codecritic/data/cov_with_diff.py
+112 -0

No files found.
--- a/codecritic/cli/run_qwq.py
+++ b/codecritic/cli/run_qwq.py
+import argparse
+from itertools import chain
+
+from codecritic.data.cov_with_diff import (
+    transform_preference_to_qwq_prompt,
+    transform_qwqout_to_trainset
+)
+
+from codecritic.utils.json import load_jsonl, save_jsonl
+from codecritic.utils.vllm import vllm_chatcomplete
+
+
+if __name__ == "__main__":
+    # Pipeline: preference pairs -> QwQ prompts -> model completions -> train set.
+    parser = argparse.ArgumentParser(
+        description="Turn a preference dataset into a training set using model-written rationales."
+    )
+    # Every option is mandatory: a missing one would otherwise surface as a
+    # confusing failure deep inside the loader or the model runner.
+    parser.add_argument(
+        "--model", type=str, required=True,
+        help="model identifier handed to vllm_chatcomplete",
+    )
+    parser.add_argument(
+        "--preference_dataset", type=str, required=True,
+        help="path of the input JSONL preference dataset",
+    )
+    parser.add_argument(
+        "--out", type=str, required=True,
+        help="path of the output JSONL; raw completions go to <out>.raw",
+    )
+    args = parser.parse_args()
+
+    preference_dataset = load_jsonl(args.preference_dataset)
+    # Each preference item yields two prompts (chosen and rejected); flatten
+    # them into a single list of requests.
+    cov_prompts = list(
+        chain.from_iterable(
+            transform_preference_to_qwq_prompt(x) for x in preference_dataset
+        )
+    )
+
+    sampling_params = dict(n=1, temperature=0.0, max_tokens=6144)
+    covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params)
+    # Persist the unprocessed completions before any post-processing so they
+    # can be inspected or re-parsed later.
+    save_jsonl(covs, args.out + ".raw")
+    dataset = [transform_qwqout_to_trainset(x) for x in covs]
+
+    save_jsonl(dataset, args.out)
\ No newline at end of file
--- a/codecritic/data/cov_with_diff.py
+++ b/codecritic/data/cov_with_diff.py
+from difflib import unified_diff
+import re
+
+from codecritic.data.code import extract_code
+from codecritic.data.verify import mk_critic_verify
+
+
+# System message sent with every review request. It is a plain (non-f) string:
+# it contains no placeholders, and keeping it a regular literal means braces
+# can be added to the prompt later without being parsed as format fields.
+SYS_PROMPT = """
+You are an AI code reviewer tasked with analyzing code solutions to programming problems. You will be given a problem description, a code solution, and information about the solution's correctness. If the solution is incorrect, you will also be provided with a diff showing the differences between the given solution and a correct one.
+
+Your task is to analyze the provided code *step-by-step*, pretending you do not know the final verdict of its correctness. Focus on understanding the code's logic, identifying potential issues, and reasoning through its execution.
+
+Output your reasoning process within a markdown code block using the following format:
+
+```rationale
+[Your step-by-step reasoning here. Explain what the code does line by line, identify potential edge cases, and discuss possible errors. Be detailed and thorough.]
+```
+
+Finally, based on your analysis, state your conclusion about the code's correctness (either "Yes" or "No") using the following format:
+
+Final Answer: (Yes or No)
+"""
+
+
+# User-message template, filled with str.format using the fields `problem`,
+# `code`, `correctness` ("Yes" or "No") and `diff` (an empty string when no
+# diff is supplied).
+USER_PROMPT = """
+Problem:
+{problem}
+
+Code:
+{code}
+
+Correctness(Yes or No): {correctness}
+
+Diff (Only if Correctness is "No"):
+{diff}
+"""
+
+
+def transform_preference_to_qwq_prompt(item):
+    """Build the two review prompts (chosen and rejected) for one preference item.
+
+    Args:
+        item: mapping with the keys ``prompt``, ``chosen`` and ``rejected``,
+            each a one-element list of message dicts carrying ``content``,
+            plus a ``problem_id``.
+
+    Returns:
+        A 2-tuple of dicts ``(chosen_entry, rejected_entry)``. Each entry has
+        ``messages`` (system + user prompt), ``eval_result`` (True for the
+        chosen solution, False for the rejected one), ``problem_id`` and
+        ``raw`` (the original prompt followed by that solution's message).
+
+    Raises:
+        AssertionError: if any of the three message lists is not of length 1.
+    """
+    assert all(
+        len(item[x]) == 1 for x in ["prompt", "chosen", "rejected"]
+    ), "prompt, chosen and rejected must each hold exactly one message"
+    problem = item["prompt"][0]["content"]
+    chosen_code = item["chosen"][0]["content"]
+    rejected_code = item["rejected"][0]["content"]
+
+    # unified_diff yields lines lazily; it must be joined into text before it
+    # is formatted into the prompt, otherwise the prompt would contain the
+    # generator's repr instead of the diff. Using lineterm="" and joining on
+    # newlines keeps lines separated even when the code lacks a final newline.
+    diff_lines = unified_diff(
+        extract_code(rejected_code).splitlines(),
+        extract_code(chosen_code).splitlines(),
+        fromfile="incorrect.py",
+        tofile="correct.py",
+        n=1,
+        lineterm="",
+    )
+    diff = "\n".join(diff_lines)
+
+    sys_message = {"role": "system", "content": SYS_PROMPT}
+    chosen_message = {
+        "role": "user",
+        "content": USER_PROMPT.format(
+            problem=problem, code=chosen_code, correctness="Yes", diff=""
+        ),
+    }
+    rejected_message = {
+        "role": "user",
+        "content": USER_PROMPT.format(
+            problem=problem, code=rejected_code, correctness="No", diff=diff
+        ),
+    }
+
+    return (
+        {
+            "messages": [sys_message, chosen_message],
+            "eval_result": True,
+            "problem_id": item["problem_id"],
+            "raw": item["prompt"] + item["chosen"],
+        },
+        {
+            "messages": [sys_message, rejected_message],
+            "eval_result": False,
+            "problem_id": item["problem_id"],
+            "raw": item["prompt"] + item["rejected"],
+        },
+    )
+
+
+# Matches the body of every ```rationale fenced block; DOTALL lets it span lines.
+rationale_pattern = re.compile(r"```rationale(.+?)```", flags=re.DOTALL)
+
+
+def extract_rationale(text):
+    """Return the stripped contents of the ```rationale block(s) in *text*.
+
+    Yields an empty string when no block is present. When several blocks are
+    found, a warning is printed and their contents are joined with newlines.
+    """
+    blocks = [found.group(1).strip() for found in rationale_pattern.finditer(text)]
+
+    if not blocks:
+        return ""
+    if len(blocks) == 1:
+        return blocks[0]
+
+    print("warning: multiple rationales")
+    return "\n".join(blocks)
+
+
+def transform_qwqout_to_trainset(item):
+    """Convert one model completion into a training conversation.
+
+    Args:
+        item: mapping with ``raw`` (the original prompt/solution messages),
+            ``messages`` (whose last entry holds the model response),
+            ``eval_result`` and ``problem_id``.
+
+    Returns:
+        A dict with ``messages`` (the raw messages, followed by the analysis
+        request, the extracted rationale and the output of
+        ``mk_critic_verify``), ``eval_result`` and ``problem_id``.
+        The input ``item`` is left unmodified.
+    """
+    # Work on a copy so the caller's item["raw"] list is not extended in place.
+    messages = list(item["raw"])
+
+    # NOTE(review): the last message is presumably the assistant reply as a
+    # {"role", "content"} dict; a bare string is accepted as well.
+    last_message = item["messages"][-1]
+    response_text = (
+        last_message["content"] if isinstance(last_message, dict) else last_message
+    )
+    rationale = extract_rationale(response_text)
+
+    messages.extend(
+        [
+            {"role": "user", "content": "Please analyze your code step by step."},
+            {"role": "assistant", "content": rationale},
+        ]
+    )
+    messages.extend(mk_critic_verify(item["eval_result"]))
+
+    return {
+        "messages": messages,
+        "eval_result": item["eval_result"],
+        "problem_id": item["problem_id"],
+    }