Commit 0e43e25c by nanziyuan

Fix bugs in vLLM and revise the scoring calculation method

parent 80ebb3b6
......@@ -20,6 +20,8 @@ def get_distance(connection_type):
return 2
elif connection_type == "PBX":
return 3
elif connection_type == "PXB":
return 3
elif connection_type == "PHB":
return 4
elif connection_type == "NODE":
......@@ -27,7 +29,7 @@ def get_distance(connection_type):
elif connection_type == "SYS":
return 6
else:
raise RuntimeError("Unknown connection type")
raise RuntimeError("Unknown connection type " + connection_type)
def get_gpu_topology():
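
For reference, the distances returned above correspond to the connection codes that `nvidia-smi topo -m` reports for each GPU pair (PIX/PXB over PCIe switches, PHB over a host bridge, NODE/SYS across NUMA nodes). A minimal sketch of how they might be used to pick the most closely connected devices, assuming the `get_distance` function above and a purely hypothetical `topology` mapping (the GPU ids and codes below are made up, not taken from this commit):

# Hypothetical pairwise topology as it might be parsed from `nvidia-smi topo -m`.
topology = {(0, 1): "PXB", (0, 2): "PHB", (1, 2): "SYS"}

def closest_pair(topology):
    # Lower distance means a faster interconnect, so the best pair is the minimum.
    return min(topology, key=lambda pair: get_distance(topology[pair]))

print(closest_pair(topology))  # -> (0, 1)
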
......@@ -134,7 +136,7 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
tensor_parallel_size=len(cuda_device))
tokenizer = llm.get_tokenizer()
stop_tokens = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
stop_tokens = [tokenizer.eos_token_id]
print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")
vllm_sampling_params = SamplingParams(
......@@ -145,6 +147,7 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
stop_token_ids=stop_tokens
)
print("Sampling params:", vllm_sampling_params)
def messages_to_text(messages):
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
......@@ -170,13 +173,36 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
message = {"role": "assistant", "content": generated_text}
messages.append(message)
item["messages"].append(message)
results.append({**item, "messages": messages})
return results
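
As a side note on the prompt construction above: `messages_to_text` relies on the tokenizer's chat template. A minimal sketch of what that call does, with a hypothetical message list that is not taken from this commit:

messages = [{"role": "user", "content": "Write a function that reverses a string."}]
# Render the chat history into a single prompt string; add_generation_prompt=True
# appends the assistant header so generation continues as the assistant's turn.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

The generated reply is then appended back to `messages` as an assistant turn, as the loop above shows.
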
def score_worker(cuda_device, prompts, model_path, score_token):
def compute_score_onetoken(logprob):
positive_token = score_token[0]
positive_logprob = logprob.get(positive_token)
positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
return positive_prob
def compute_score_twotoken(logprob):
positive_token, negative_token = score_token[0], score_token[1]
positive_logprob = logprob.get(positive_token)
positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
negative_logprob = logprob.get(negative_token)
negative_prob = np.exp(negative_logprob.logprob) if negative_logprob else 0
return positive_prob / (positive_prob + negative_prob)
if len(score_token) == 1:
compute_score = compute_score_onetoken
elif len(score_token) == 2:
compute_score = compute_score_twotoken
else:
raise NotImplementedError("param: score_token length should be 1 or 2")
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_device)
llm = LLM(model=model_path,
......@@ -203,9 +229,7 @@ def score_worker(cuda_device, prompts, model_path, score_token):
for item, output in zip(prompts, outputs):
for response in output.outputs:
# response.logprobs: list[dict[int, Logprob]] https://github.com/vllm-project/vllm/blob/main/vllm/sequence.py
sample_logprobs = response.logprobs
logprob = sample_logprobs[0].get(score_token)
score = np.exp(logprob.logprob) if logprob else 0
score = compute_score(response.logprobs[0])
text = response.text
results.append({**item, "score": score, "critic_text": text})
......
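
To make the revised scoring explicit: with one score token the score is simply the probability of that token at the first generated position; with two tokens it is renormalised over the positive/negative pair. A worked sketch, assuming `score_token` holds token ids and `first_logprobs` is `response.logprobs[0]`, a `dict[int, Logprob]` in vLLM:

import numpy as np

def two_token_score(first_logprobs, positive_id, negative_id):
    pos = first_logprobs.get(positive_id)
    neg = first_logprobs.get(negative_id)
    p_pos = np.exp(pos.logprob) if pos else 0.0
    p_neg = np.exp(neg.logprob) if neg else 0.0
    # Renormalise over the two judgement tokens so the score stays in [0, 1]
    # regardless of how much probability mass falls on other tokens.
    # (As in the diff, this divides by zero if neither token appears in the
    # returned top logprobs.)
    return p_pos / (p_pos + p_neg)

# Example: logprobs of -0.105 (positive) and -2.30 (negative) correspond to
# probabilities of about 0.90 and 0.10, giving a score of roughly 0.90.
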