Commit ed46c1a3 by 苏舞仙

no ref

parent 43b239ac
@@ -111,7 +111,6 @@ class TaskRunner:
         role_worker_mapping = {
             Role.ActorRollout: ray.remote(ActorRolloutRefWorker),
             Role.Critic: ray.remote(CriticWorker),
-            Role.RefPolicy: ray.remote(ActorRolloutRefWorker)
         }
         global_pool_id = 'global_pool'
@@ -121,9 +120,12 @@ class TaskRunner:
         mapping = {
             Role.ActorRollout: global_pool_id,
             Role.Critic: global_pool_id,
-            Role.RefPolicy: global_pool_id,
         }
+        if config.algorithm.kl_ctrl.kl_coef != 0 or (config.actor_rollout_ref.actor.use_kl_loss == True and config.actor_rollout_ref.actor.kl_loss_coef != 0):
+            role_worker_mapping[Role.RefPolicy] = ray.remote(ActorRolloutRefWorker)
+            mapping[Role.RefPolicy] = global_pool_id
         # we should adopt a multi-source reward function here
         # - for rule-based rm, we directly call a reward score
         # - for model-based rm, we call a model
...
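For context, a minimal sketch of the pattern this commit introduces: the reference-policy worker is only registered (and its Ray worker only spawned) when some KL term against the reference model is actually active. needs_ref_policy is a hypothetical helper written for illustration, not a function from the codebase; its three parameters mirror the config fields tested in the diff.

# A minimal sketch of the conditional-registration idea, assuming the three
# config fields shown in the diff. needs_ref_policy is a hypothetical
# helper for illustration only.

def needs_ref_policy(kl_coef: float, use_kl_loss: bool, kl_loss_coef: float) -> bool:
    """A reference-policy worker is only needed when a KL term against the
    reference model actually contributes to training: either as a reward
    penalty (kl_coef != 0) or as an explicit actor loss (use_kl_loss with
    a non-zero kl_loss_coef)."""
    return kl_coef != 0 or (use_kl_loss and kl_loss_coef != 0)


# With every KL term disabled, no RefPolicy entry is added to
# role_worker_mapping, so no extra ActorRolloutRefWorker is spawned
# and its GPU memory is saved.
assert not needs_ref_policy(kl_coef=0.0, use_kl_loss=False, kl_loss_coef=0.001)
assert needs_ref_policy(kl_coef=0.001, use_kl_loss=False, kl_loss_coef=0.0)
assert needs_ref_policy(kl_coef=0.0, use_kl_loss=True, kl_loss_coef=0.001)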