Commit 31ee9176 by Yaoyu Zhu

fix bugs in using acc as dynamic sampling metric

parent 119a9e2d
...
@@ -110,10 +110,10 @@ python3 -m verl.trainer.main_ppo \
     custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer} \
     custom_reward_function.overlong_buffer.len=${overlong_buffer_len} \
     custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
-    custom_reward_function.path=verl/utils/reward_score/codev.py \
-    custom_reward_function.name=compute_score_wrapper \
-    custom_reward_function.continuous_reward.enable=True \
-    custom_reward_function.continuous_reward.error_ratio_threshold=0.5 \
+    custom_reward_function.train.path=verl/utils/reward_score/codev.py \
+    custom_reward_function.train.name=compute_score_wrapper \
+    custom_reward_function.train.continuous_reward.enable=True \
+    custom_reward_function.train.continuous_reward.err_threshold=0.5 \
     algorithm.kl_ctrl.kl_coef=0.0 \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
...
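
The launch script now scopes the custom scorer under custom_reward_function.train.* (and the key error_ratio_threshold becomes err_threshold), which suggests this fork configures training and validation reward functions separately. Upstream verl resolves such a path/name pair into a callable with an importlib-based loader; the sketch below only illustrates that pattern, and load_reward_fn is an illustrative helper, not code from this commit.

import importlib.util
import os


def load_reward_fn(file_path: str, function_name: str):
    """Import `function_name` from the Python file at `file_path`."""
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"custom reward file not found: {file_path}")

    # Load the file as an anonymous module, then pull the named function out of it.
    spec = importlib.util.spec_from_file_location("custom_reward_module", file_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    if not hasattr(module, function_name):
        raise AttributeError(f"{function_name} is not defined in {file_path}")
    return getattr(module, function_name)


# e.g. compute_score = load_reward_fn("verl/utils/reward_score/codev.py", "compute_score_wrapper")
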
...
@@ -977,6 +977,7 @@ class RayPPOTrainer(object):
 new_batch = new_batch.union(reward_tensor)
 
 # we combine with rule-based rm
+# print("=="*30)
 reward_extra_infos_dict: dict[str, list]
 try:
     reward_result = self.reward_fn(new_batch, return_dict=True)
...
@@ -987,9 +988,11 @@ class RayPPOTrainer(object):
     reward_tensor = self.reward_fn(new_batch)
     reward_extra_infos_dict = {}
 
+# print("=="*30)
+# print('In ray trainer, keys are', new_batch.batch.keys())
 new_batch.batch['token_level_scores'] = reward_tensor
 
-print(f'{list(reward_extra_infos_dict.keys())=}')
+# print(f'{list(reward_extra_infos_dict.batch.keys())=}')
 if reward_extra_infos_dict:
     new_batch.non_tensor_batch.update({
         k: np.array(v) for k, v in reward_extra_infos_dict.items()
...
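
The try branch calls self.reward_fn(new_batch, return_dict=True) while the except branch falls back to a plain tensor call, so the reward manager is expected to support both calling conventions; the lines that unpack reward_result are elided between the two hunks. The stub below only sketches that assumed contract (the key names reward_tensor and reward_extra_info are assumptions, not taken from this commit).

from typing import Any, Dict, Union

import torch


def reward_fn_stub(batch: Any, return_dict: bool = False) -> Union[torch.Tensor, Dict[str, Any]]:
    # Illustrative stand-in for self.reward_fn: token-level scores plus
    # list-valued per-sample extras (e.g. 'acc') that the trainer can wrap
    # with np.array() before merging into non_tensor_batch.
    reward_tensor = torch.zeros(4, 16)                 # (batch, response_len)
    reward_extra_info = {"acc": [0.0, 1.0, 1.0, 0.0]}  # per-sample metrics

    if return_dict:
        return {"reward_tensor": reward_tensor, "reward_extra_info": reward_extra_info}
    return reward_tensor


result = reward_fn_stub(None, return_dict=True)
reward_tensor = result["reward_tensor"]
reward_extra_infos_dict = result["reward_extra_info"]
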
...
@@ -74,6 +74,7 @@ class NaiveRewardManager:
         )
         scores.append(score)
     data.batch['acc'] = torch.tensor(scores, dtype=torch.float32, device=prompt_ids.device)
+    data.non_tensor_batch['acc'] = data.batch['acc'].numpy()
     return scores
 
 def __call__(self, data: DataProto, return_dict: bool = False):
...
...
@@ -138,6 +138,9 @@ class PrimeRewardManager:
         print(f"Unexpected error in batched reward computing. Setting all as 0.: {e}")
         scores = [0. for _ in range(len(sequences_str))]
     data.batch['acc'] = torch.tensor(scores, dtype=torch.float32, device=prompt_ids.device)
+    data.non_tensor_batch['acc'] = data.batch['acc'].numpy()
+    # print('Calculated reward scores in PrimeRewardManager!!!')
+    # print('In verify of PrimeRewardManager, keys are', data.batch.keys())
     return scores
 
 def __call__(self, data: DataProto, return_dict: bool = False):
...
@@ -166,6 +169,7 @@ class PrimeRewardManager:
 extra_info = data.non_tensor_batch.get('extra_info', [None] * len(data_sources))
 
 scores = self.verify(data)
+# print('In __call__ of PrimeRewardManager, keys are', data.batch.keys())
 for i in range(len(data)):
     reward = scores[i]
...
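
Both reward managers now mirror the per-sample accuracy into data.non_tensor_batch['acc'] as a numpy array, in addition to the tensor kept in data.batch. That mirror is presumably the fix named in the commit message: DAPO-style dynamic sampling filters prompt groups by a metric looked up by name in non_tensor_batch, so an 'acc' that lives only in the TensorDict would not be found. Below is a minimal sketch of such a group filter; the uid grouping and the keep-rule (drop groups whose accuracy has no variance) follow the common dynamic-sampling recipe and are not copied from this repository.

from collections import defaultdict

import numpy as np


def keep_indices_by_acc(uids, accs):
    """Keep samples whose prompt group has mixed outcomes (std > 0)."""
    uid2accs = defaultdict(list)
    for idx, uid in enumerate(uids):
        uid2accs[uid].append(accs[idx])

    kept_uids = {uid for uid, vals in uid2accs.items() if np.std(vals) > 0}
    return [idx for idx, uid in enumerate(uids) if uid in kept_uids]


# e.g. two rollouts per prompt; accs would come from non_tensor_batch['acc']
uids = np.array(["p0", "p0", "p1", "p1"])
accs = np.array([1.0, 0.0, 1.0, 1.0])
print(keep_indices_by_acc(uids, accs))  # [0, 1] -- p1 is all-correct, so dropped

Groups where every rollout is correct or every rollout is wrong carry no relative advantage signal, so filtering them out keeps the update focused on prompts with informative variance.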