Commit 0e43e25c by nanziyuan

Fix bugs in vLLM and revise the scoring calculation method

parent 80ebb3b6
......@@ -20,6 +20,8 @@ def get_distance(connection_type):
return 2
elif connection_type == "PBX":
return 3
elif connection_type == "PXB":
return 3
elif connection_type == "PHB":
return 4
elif connection_type == "NODE":
......@@ -27,7 +29,7 @@ def get_distance(connection_type):
elif connection_type == "SYS":
return 6
else:
raise RuntimeError("Unknown connection type")
raise RuntimeError("Unknown connection type " + connection_type)
def get_gpu_topology():
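
For reference, the distances returned above correspond to the connection codes that `nvidia-smi topo -m` reports for each GPU pair (PIX/PXB over PCIe switches, PHB over a host bridge, NODE/SYS across NUMA nodes). A minimal sketch of how they might be used to pick the most closely connected devices, assuming the `get_distance` function above and a purely hypothetical `topology` mapping (the GPU ids and codes below are made up, not taken from this commit):

# Hypothetical pairwise topology as it might be parsed from `nvidia-smi topo -m`.
topology = {(0, 1): "PXB", (0, 2): "PHB", (1, 2): "SYS"}

def closest_pair(topology):
    # Lower distance means a faster interconnect, so the best pair is the minimum.
    return min(topology, key=lambda pair: get_distance(topology[pair]))

print(closest_pair(topology))  # -> (0, 1)
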
......@@ -134,7 +136,7 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
tensor_parallel_size=len(cuda_device))
tokenizer = llm.get_tokenizer()
stop_tokens = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")]
stop_tokens = [tokenizer.eos_token_id]
print(f"SUCCESS: load llm {model_path} on cuda {cuda_device}")
vllm_sampling_params = SamplingParams(
......@@ -145,6 +147,7 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
stop_token_ids=stop_tokens
)
print("Sampling params:", vllm_sampling_params)
def messages_to_text(messages):
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
......@@ -170,13 +173,36 @@ def generate_worker(cuda_device, prompts, model_path, sampling_params):
message = {"role": "assistant", "content": generated_text}
messages.append(message)
item["messages"].append(message)
results.append({**item, "messages": messages})
return results
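
As a side note on the prompt construction above: `messages_to_text` relies on the tokenizer's chat template. A minimal sketch of what that call does, with a hypothetical message list that is not taken from this commit:

messages = [{"role": "user", "content": "Write a function that reverses a string."}]
# Render the chat history into a single prompt string; add_generation_prompt=True
# appends the assistant header so generation continues as the assistant's turn.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

The generated reply is then appended back to `messages` as an assistant turn, as the loop above shows.
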
def score_worker(cuda_device, prompts, model_path, score_token):
def compute_score_onetoken(logprob):
positive_token = score_token[0]
positive_logprob = logprob.get(positive_token)
positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
return positive_prob
def compute_score_twotoken(logprob):
positive_token, negative_token = score_token[0], score_token[1]
positive_logprob = logprob.get(positive_token)
positive_prob = np.exp(positive_logprob.logprob) if positive_logprob else 0
negative_logprob = logprob.get(negative_token)
negative_prob = np.exp(negative_logprob.logprob) if negative_logprob else 0
return positive_prob / (positive_prob + negative_prob)
if len(score_token) == 1:
compute_score = compute_score_onetoken
elif len(score_token) == 2:
compute_score = compute_score_twotoken
else:
raise NotImplementedError("param: score_token length should be 1 or 2")
os.environ["CUDA_VISIBLE_DEVICES"] = ','.join(cuda_device)
llm = LLM(model=model_path,
......@@ -203,9 +229,7 @@ def score_worker(cuda_device, prompts, model_path, score_token):
for item, output in zip(prompts, outputs):
for response in output.outputs:
# response.logprobs: list[dict[int, Logprob]] https://github.com/vllm-project/vllm/blob/main/vllm/sequence.py
sample_logprobs = response.logprobs
logprob = sample_logprobs[0].get(score_token)
score = np.exp(logprob.logprob) if logprob else 0
score = compute_score(response.logprobs[0])
text = response.text
results.append({**item, "score": score, "critic_text": text})
......
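
To make the revised scoring explicit: with one score token the score is simply the probability of that token at the first generated position; with two tokens it is renormalised over the positive/negative pair. A worked sketch, assuming `score_token` holds token ids and `first_logprobs` is `response.logprobs[0]`, a `dict[int, Logprob]` in vLLM:

import numpy as np

def two_token_score(first_logprobs, positive_id, negative_id):
    pos = first_logprobs.get(positive_id)
    neg = first_logprobs.get(negative_id)
    p_pos = np.exp(pos.logprob) if pos else 0.0
    p_neg = np.exp(neg.logprob) if neg else 0.0
    # Renormalise over the two judgement tokens so the score stays in [0, 1]
    # regardless of how much probability mass falls on other tokens.
    # (As in the diff, this divides by zero if neither token appears in the
    # returned top logprobs.)
    return p_pos / (p_pos + p_neg)

# Example: logprobs of -0.105 (positive) and -2.30 (negative) correspond to
# probabilities of about 0.90 and 0.10, giving a score of roughly 0.90.
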