Commit 31ee9176 by Yaoyu Zhu

fix bugs in using acc as dynamic sampling metric

parent 119a9e2d
......@@ -110,10 +110,10 @@ python3 -m verl.trainer.main_ppo \
custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer} \
custom_reward_function.overlong_buffer.len=${overlong_buffer_len} \
custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
custom_reward_function.path=verl/utils/reward_score/codev.py \
custom_reward_function.name=compute_score_wrapper \
custom_reward_function.continuous_reward.enable=True \
custom_reward_function.continuous_reward.error_ratio_threshold=0.5 \
custom_reward_function.train.path=verl/utils/reward_score/codev.py \
custom_reward_function.train.name=compute_score_wrapper \
custom_reward_function.train.continuous_reward.enable=True \
custom_reward_function.train.continuous_reward.err_threshold=0.5 \
algorithm.kl_ctrl.kl_coef=0.0 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
......
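
The script change moves the custom reward settings under a `train` sub-key (`custom_reward_function.train.path`, `.name`, `.continuous_reward.*`). Below is a minimal sketch of how such a path/name pair is typically resolved into a callable via importlib; the function name and error handling here are illustrative assumptions, not the exact loader used by this fork.

```python
import importlib.util
import os

def load_custom_reward_fn(path: str, name: str):
    """Hypothetical loader: import attribute `name` from the module file at `path`.

    Sketches the usual way a dotted override pair like
    custom_reward_function.train.path / custom_reward_function.train.name
    is turned into a callable; not the project's actual implementation.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"custom reward module not found: {path}")
    spec = importlib.util.spec_from_file_location("custom_reward_module", path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, name)

# e.g. compute_score = load_custom_reward_fn(
#     "verl/utils/reward_score/codev.py", "compute_score_wrapper")
```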
......@@ -977,6 +977,7 @@ class RayPPOTrainer(object):
new_batch = new_batch.union(reward_tensor)
# we combine with rule-based rm
# print("=="*30)
reward_extra_infos_dict: dict[str, list]
try:
reward_result = self.reward_fn(new_batch, return_dict=True)
......@@ -987,9 +988,11 @@ class RayPPOTrainer(object):
reward_tensor = self.reward_fn(new_batch)
reward_extra_infos_dict = {}
# print("=="*30)
# print('In ray trainer, keys are', new_batch.batch.keys())
new_batch.batch['token_level_scores'] = reward_tensor
print(f'{list(reward_extra_infos_dict.keys())=}')
# print(f'{list(reward_extra_infos_dict.batch.keys())=}')
if reward_extra_infos_dict:
new_batch.non_tensor_batch.update({
k: np.array(v) for k, v in reward_extra_infos_dict.items()
......
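
The trainer hunks above thread the per-sample extras returned by the reward function into the batch's non-tensor store, which is what lets `acc` be used as the dynamic sampling metric. The sketch below reconstructs that control flow in a self-contained form; `new_batch`, `reward_fn`, and the `'reward_tensor'` / `'reward_extra_info'` result keys are assumptions filled in from the truncated hunk, not code shown in this commit.

```python
import numpy as np

def score_batch(new_batch, reward_fn):
    # Minimal sketch of the flow in the hunk above (hypothetical names).
    reward_extra_infos_dict: dict[str, list]
    try:
        # Preferred path: reward manager returns the token-level reward tensor
        # plus per-sample extras (e.g. accuracy) in a dict.
        reward_result = reward_fn(new_batch, return_dict=True)
        reward_tensor = reward_result['reward_tensor']
        reward_extra_infos_dict = reward_result['reward_extra_info']
    except Exception as e:
        # Fallback: older reward managers only return the tensor.
        print(f'Error in reward_fn: {e}')
        reward_tensor = reward_fn(new_batch)
        reward_extra_infos_dict = {}

    new_batch.batch['token_level_scores'] = reward_tensor
    if reward_extra_infos_dict:
        # Store extras as numpy arrays so non-tensor batch filtering can use them.
        new_batch.non_tensor_batch.update(
            {k: np.array(v) for k, v in reward_extra_infos_dict.items()})
    return new_batch
```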
......@@ -74,6 +74,7 @@ class NaiveRewardManager:
)
scores.append(score)
data.batch['acc'] = torch.tensor(scores, dtype=torch.float32, device=prompt_ids.device)
data.non_tensor_batch['acc'] = data.batch['acc'].numpy()
return scores
def __call__(self, data: DataProto, return_dict: bool = False):
......
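
The added line mirrors the `acc` tensor into `non_tensor_batch` so code that only inspects non-tensor fields can read it. One caveat, shown as a defensive sketch below: the tensor is created on `prompt_ids.device`, and `.numpy()` only works on CPU tensors, so a `.cpu()` call would be needed if that device were ever a GPU. This is an assumption about possible device placement, not a claim that the original is broken in practice.

```python
import torch

def store_acc(data, scores, device):
    # Device-safe variant of the mirroring done in verify() above (sketch only).
    acc = torch.tensor(scores, dtype=torch.float32, device=device)
    data.batch['acc'] = acc
    # detach/cpu is defensive: .numpy() raises if the tensor lives on a GPU.
    data.non_tensor_batch['acc'] = acc.detach().cpu().numpy()
```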
......@@ -138,6 +138,9 @@ class PrimeRewardManager:
print(f"Unexpected error in batched reward computing. Setting all as 0.: {e}")
scores = [0. for _ in range(len(sequences_str))]
data.batch['acc'] = torch.tensor(scores, dtype=torch.float32, device=prompt_ids.device)
data.non_tensor_batch['acc'] = data.batch['acc'].numpy()
# print('Calculated reward scores in PrimeRewardManager!!!')
# print('In verify of PrimeRewardManager, keys are', data.batch.keys())
return scores
def __call__(self, data: DataProto, return_dict: bool = False):
......@@ -166,6 +169,7 @@ class PrimeRewardManager:
extra_info = data.non_tensor_batch.get('extra_info', [None] * len(data_sources))
scores = self.verify(data)
# print('In __call__ of PrimeRewardManager, keys are', data.batch.keys())
for i in range(len(data)):
reward = scores[i]
......
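
Exposing `acc` in `non_tensor_batch` is what allows a dynamic-sampling filter to group generations by prompt and drop prompts that give no learning signal. The sketch below illustrates that idea; the `uid` field, the keep-if-accuracy-is-mixed rule, and the function name are assumptions for illustration, not the trainer's actual filter.

```python
from collections import defaultdict
import numpy as np

def keep_informative_prompts(uids: np.ndarray, acc: np.ndarray) -> list[int]:
    """Illustrative dynamic-sampling filter (not this repo's code).

    Groups samples by prompt uid and keeps only prompts whose mean accuracy
    is neither 0 nor 1, i.e. prompts that still provide gradient signal.
    Returns indices of samples to keep.
    """
    acc_by_uid = defaultdict(list)
    for i, uid in enumerate(uids):
        acc_by_uid[uid].append(acc[i])
    kept_uids = {uid for uid, a in acc_by_uid.items() if 0.0 < float(np.mean(a)) < 1.0}
    return [i for i, uid in enumerate(uids) if uid in kept_uids]

# e.g.:
# idx = keep_informative_prompts(batch.non_tensor_batch['uid'],
#                                batch.non_tensor_batch['acc'])
```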