Commit 432936ac by nanziyuan

add qwq & fix vllm bugs

parent 0e43e25c
@@ -21,9 +21,9 @@ if __name__ == "__main__":
cov_prompts = [transform_preference_to_qwq_prompt(x) for x in preference_dataset]
cov_prompts = list(chain(*cov_prompts))
sampling_params = dict(n=1, temperature=0.0, max_tokens=6144)
covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params)
save_jsonl(covs, args.out + ".raw")
sampling_params = dict(n=1, temperature=0, max_tokens=6144)
covs = vllm_chatcomplete(args.model, cov_prompts, sampling_params, 2)
# save_jsonl(covs, args.out + ".raw")
dataset = list(map(transform_qwqout_to_trainset, covs))
save_jsonl(dataset, args.out)
\ No newline at end of file
save_jsonl(dataset, args.out)
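For reference, a minimal sketch of what the greedy sampling configuration above corresponds to in vLLM's own API, assuming `vllm_chatcomplete` forwards the dict to `vllm.SamplingParams`, that the trailing `2` is a GPU/worker count (a guess; the diff does not say), and a vLLM version new enough to expose `LLM.chat`:

```python
from vllm import LLM, SamplingParams

# Assumed equivalent of dict(n=1, temperature=0, max_tokens=6144):
# a single greedy completion of at most 6144 tokens per prompt.
params = SamplingParams(n=1, temperature=0.0, max_tokens=6144)

# Hypothetical direct usage; the script goes through vllm_chatcomplete instead,
# and the model path / parallelism below are placeholders.
llm = LLM(model="Qwen/QwQ-32B-Preview", tensor_parallel_size=2)
outputs = llm.chat(
    [{"role": "user", "content": "Analyze this code step by step."}],
    params,
)
print(outputs[0].outputs[0].text)
```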
@@ -4,21 +4,24 @@ import re
from codecritic.data.code import extract_code
from codecritic.data.verify import mk_critic_verify
# QwQ doesn't follow my instructions, but it outputs a *really* reasonable explanation
SYS_PROMPT = f"""
You are an AI code reviewer tasked with analyzing code solutions to programming problems. You will be given a problem description, a code solution, and information about the solution's correctness. If the solution is incorrect, you will also be provided with a diff showing the differences between the given solution and a correct one.
Your task is to analyze the provided code *step-by-step*, pretending you do not know the final verdict of its correctness. Focus on understanding the code's logic, identifying potential issues, and reasoning through its execution.
Your task is to analyze the provided code *step-by-step*, reasoning through its logic and identifying potential issues. Initially, approach the analysis as if you don't know the final judgement of its correctness. However, your final conclusion about the code's correctness must align with the provided information.
Output your reasoning process within a markdown code block using the following format:
```rationale
[Your step-by-step reasoning here. Explain what the code does line by line, identify potential edge cases, and discuss possible errors. Be detailed and thorough.]
```Rationale
[Your step-by-step reasoning here. Explain what the code does line by line and discuss possible errors.]
```
Don't simulate the code's runtime behavior, mentally execute it with specific inputs, or predict its output.
Finally, based on your analysis, state your conclusion about the code's correctness (either "Yes" or "No") using the following format:
Final Answer: (Yes or No)
Final Answer:
(Yes or No)
"""
@@ -37,18 +40,18 @@ Diff (Only if Correctness is "No"):
def transform_preference_to_qwq_prompt(item):
assert all(len(item[x]) == 1 for x in ["prompt", "chosen", "rejected"])
problem = item["prompt"][0]["content"]
assert all(len(item[x]) == 1 for x in ["messages", "chosen", "rejected"])
problem = item["messages"][0]["content"]
chosen_code = item["chosen"][0]["content"]
rejected_code = item["rejected"][0]["content"]
diff = unified_diff(
diff = "".join(unified_diff(
extract_code(rejected_code).splitlines(keepends=True),
extract_code(chosen_code).splitlines(keepends=True),
fromfile="incorrect.py",
tofile="correct.py",
n=1,
)
))
sys_message = {"role": "system", "content": SYS_PROMPT}
chosen_message = {
@@ -68,14 +71,12 @@ def transform_preference_to_qwq_prompt(item):
{
"messages": [sys_message, chosen_message],
"eval_result": True,
"problem_id": item["problem_id"],
"raw": item["prompt"] + item["chosen"],
"raw": item["messages"] + item["chosen"],
},
{
"messages": [sys_message, rejected_message],
"eval_result": False,
"problem_id": item["problem_id"],
"raw": item["prompt"] + item["rejected"],
"raw": item["messages"] + item["rejected"],
},
)
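The switch from `diff = unified_diff(...)` to `diff = "".join(unified_diff(...))` matters because `difflib.unified_diff` returns a lazy generator of lines, so interpolating it into the prompt would embed a generator repr instead of the diff text. A self-contained illustration with hypothetical snippets:

```python
from difflib import unified_diff

rejected = "def add(a, b):\n    return a - b\n"  # hypothetical incorrect solution
chosen = "def add(a, b):\n    return a + b\n"    # hypothetical correct solution

# Joining the generator's lines yields the text that can go into the prompt.
diff = "".join(unified_diff(
    rejected.splitlines(keepends=True),
    chosen.splitlines(keepends=True),
    fromfile="incorrect.py",
    tofile="correct.py",
    n=1,
))
print(diff)
# --- incorrect.py
# +++ correct.py
# @@ -1,2 +1,2 @@
#  def add(a, b):
# -    return a - b
# +    return a + b
```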
@@ -96,17 +97,17 @@ def extract_rationale(text):
def transform_qwqout_to_trainset(item):
messages = item["raw"]
rationale = extract_rationale(item["messages"][-1])
rationale = item["messages"][-1]["content"]
messages += [
response = [
{"role": "user", "content": "Please analyze your code step by step."},
{"role": "assistant", "content": rationale},
]
messages += mk_critic_verify(item["eval_result"])
response += mk_critic_verify(item["eval_result"])
return {
"messages": messages,
"question": messages,
"response": response,
"eval_result": item["eval_result"],
"problem_id": item["problem_id"],
}
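To make the new question/response split concrete, a record produced by `transform_qwqout_to_trainset` would look roughly like the hypothetical one below; the exact turns appended by `mk_critic_verify` are an assumption:

```python
example_record = {
    "question": [  # item["raw"]: the original problem plus the judged solution
        {"role": "user", "content": "Write add(a, b) that returns a + b."},
        {"role": "assistant", "content": "def add(a, b):\n    return a + b"},  # candidate code (normally fenced)
    ],
    "response": [
        {"role": "user", "content": "Please analyze your code step by step."},
        {"role": "assistant", "content": "The function adds its two arguments..."},
        # Assumed shape of mk_critic_verify(True); the real helper may differ.
        {"role": "user", "content": "Is the code correct? (Yes/No)"},
        {"role": "assistant", "content": "Yes"},
    ],
    "eval_result": True,
    "problem_id": "demo-0001",  # hypothetical id
}
```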
@@ -3,7 +3,7 @@ from vllm import LLM, SamplingParams
import os
from concurrent.futures import ProcessPoolExecutor
from itertools import chain, combinations
from functools import partial, wraps
from functools import partial
import subprocess
import numpy as np
@@ -184,7 +184,7 @@ def score_worker(cuda_device, prompts, model_path, score_token):
positive_token = score_token[0]
positive_logprob = logprob.get(positive_token)
positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
return positive_prob
return {"score": positive_prob}
def compute_score_twotoken(logprob):
positive_token, negative_token = score_token[0], score_token[1]
@@ -194,7 +194,10 @@ def score_worker(cuda_device, prompts, model_path, score_token):
negative_logprob = logprob.get(negative_token)
negative_prob = np.exp(negative_logprob.logprob) if negative_logprob else 0
return positive_prob / (positive_prob + negative_prob)
return {
"score": positive_prob / (positive_prob + negative_prob),
"uncertainty": 1 - (positive_prob + negative_prob)
}
if len(score_token) == 1:
compute_score = compute_score_onetoken
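The reworked return values are simple arithmetic on the verdict-token probabilities. Restated standalone (a sketch, not the repository's function), with the two probabilities recovered from their logprobs:

```python
import numpy as np

def two_token_score(yes_logprob, no_logprob):
    # Normalize P(Yes) against the mass assigned to either verdict token,
    # and report how much probability fell on neither of them.
    p_yes, p_no = np.exp(yes_logprob), np.exp(no_logprob)
    return {
        "score": p_yes / (p_yes + p_no),
        "uncertainty": 1 - (p_yes + p_no),
    }

# e.g. logprobs of -0.105 ("Yes") and -2.9 ("No") give a score of ~0.94
# and an uncertainty of ~0.04.
print(two_token_score(-0.105, -2.9))
```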
@@ -229,10 +232,10 @@ def score_worker(cuda_device, prompts, model_path, score_token):
for item, output in zip(prompts, outputs):
for response in output.outputs:
# response.logprobs: list[dict[int, Logprob]] https://github.com/vllm-project/vllm/blob/main/vllm/sequence.py
score = compute_score(response.logprobs[0])
scores = compute_score(response.logprobs[0])
text = response.text
results.append({**item, "score": score, "critic_text": text})
results.append({**item, **scores, "critic_text": text})
return results
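Since each result row now merges the prompt item, the score fields, and the critic text, a downstream consumer can filter and rank on those keys directly; a hypothetical example, not part of the repository:

```python
def pick_best_solution(results, max_uncertainty=0.2):
    # Prefer responses whose Yes/No mass was concentrated (low uncertainty),
    # then take the one the critic scored highest.
    confident = [r for r in results if r.get("uncertainty", 0.0) <= max_uncertainty]
    candidates = confident or results  # fall back if nothing is confident enough
    return max(candidates, key=lambda r: r["score"])
```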
......