Unverified Commit c8b9c355 by zhou fan, committed by GitHub

fix the split placement example (#281)

The split placement example is outdated; I tried it and encountered some errors. To address this, this PR makes the following changes:
1. Copied the content of `verl/trainer/config/ppo_trainer.yaml` to `examples/split_placement/config/ppo_trainer_split.yaml`.
2. Copied the `RayPPOTrainer.fit` method into the `fit` function in `examples/split_placement/split_monkey_patch.py` and modified it to get the futures of `critic_output` and `actor_output` (see the sketch below).
parent 828df7e8
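For context, the heart of change 2 is the ordering: the critic update and the actor update are both launched before either result is waited on, so the two worker groups run concurrently and the single controller blocks only once. A minimal, runnable sketch of that pattern, using plain Ray tasks as stand-ins for verl's worker groups (in the patched `fit` the worker-group calls return future-like results that are resolved with `get`; every name below is illustrative, not taken from the diff):

```python
import time

import ray

ray.init()

@ray.remote
def update_critic(batch):
    # Stand-in for the critic worker group's update: pretend it takes 1 s.
    time.sleep(1.0)
    return {"critic/loss": 0.0}

@ray.remote
def update_actor(batch):
    # Stand-in for the actor worker group's update: pretend it takes 1 s.
    time.sleep(1.0)
    return {"actor/loss": 0.0}

batch = {}  # stand-in for the DataProto batch
start = time.time()

# Launch both updates before waiting on either one.
critic_future = update_critic.remote(batch)
actor_future = update_actor.remote(batch)

# The single controller process blocks only here, after both are in flight.
critic_output, actor_output = ray.get([critic_future, actor_future])

# Prints roughly 1 s, not 2 s, because the two updates overlapped.
print(f"both updates finished in {time.time() - start:.1f}s")
```

With blocking calls, as in the stock `RayPPOTrainer.fit`, the actor update would only start after the critic update returned; issuing both first is what turns the split placement into an actual speedup.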
...@@ -44,7 +44,7 @@ def update_critic(self, data: DataProto): ...@@ -44,7 +44,7 @@ def update_critic(self, data: DataProto):
... ...
``` ```
We can also parallelize the computation of `ref_log_prob` and `values` and `rewards` in the split placement. For simplicity of the tutorial, we We can also parallelize the computation of `ref_log_prob` and `values` and `rewards` in the split placement. For simplicity of the tutorial, we don't do this in this example.
### Step 3: Execute these operation in parallel in the single controller process ### Step 3: Execute these operation in parallel in the single controller process
To implement the parallel execution of the actor and critic update, the only thing we need to modify in the `ray_trainer.py` is to `get` the concurrent `futures` on the single controller process. To implement the parallel execution of the actor and critic update, the only thing we need to modify in the `ray_trainer.py` is to `get` the concurrent `futures` on the single controller process.
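The line added above points at a further optimization the example deliberately skips: `ref_log_prob`, `values`, and `rewards` could be launched the same way and gathered together. A sketch of what that could look like, where the worker-group handles and method names (`compute_ref_log_prob`, `compute_values`, `compute_rm_score`) mirror the calls made in `RayPPOTrainer.fit` but should be read as assumptions, not as code from this commit:

```python
# Hypothetical extension sketched only in the tutorial text: launch the three
# computations without blocking, then resolve all futures at the end.
def compute_ref_values_rewards_concurrently(ref_policy_wg, critic_wg, rm_wg, batch):
    # All three calls are assumed to be non-blocking and to return
    # future-like objects with a get() method.
    ref_log_prob_future = ref_policy_wg.compute_ref_log_prob(batch)
    values_future = critic_wg.compute_values(batch)
    rewards_future = rm_wg.compute_rm_score(batch)

    # Block once, on the single controller, after all three are in flight.
    return (
        ref_log_prob_future.get(),
        values_future.get(),
        rewards_future.get(),
    )
```

The remainder of the commit updates `examples/split_placement/config/ppo_trainer_split.yaml` so that it matches the current `verl/trainer/config/ppo_trainer.yaml`: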
@@ -9,24 +9,32 @@ data:
   val_batch_size: 1312
   return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
   return_raw_chat: False
+  shuffle: True

 actor_rollout_ref:
   hybrid_engine: True
   model:
     path: ~/models/deepseek-llm-7b-chat
     external_lib: null
-    override_config: {}
-    enable_gradient_checkpointing: False
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
   actor:
     strategy: fsdp # This is for backward-compatibility
     ppo_mini_batch_size: 256
     ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-    ppo_micro_batch_size_per_gpu: 64
+    ppo_micro_batch_size_per_gpu: null
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
     grad_clip: 1.0
     clip_ratio: 0.2
     entropy_coeff: 0.001
+    use_kl_loss: False # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
     ppo_epochs: 1
-    shuffle: True
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
     optim:
       lr: 1e-6
       lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
@@ -40,6 +48,7 @@ actor_rollout_ref:
       param_offload: False
       grad_offload: False
       optimizer_offload: False
+      fsdp_size: -1
   ref:
     fsdp_config:
       param_offload: False
@@ -47,7 +56,10 @@ actor_rollout_ref:
         # transformer_layer_cls_to_wrap: None
         min_num_params: 0
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 128
+    log_prob_micro_batch_size_per_gpu: null
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
   rollout:
     name: vllm
     temperature: 1.0
@@ -66,7 +78,11 @@ actor_rollout_ref:
     max_num_batched_tokens: 8192
     max_num_seqs: 1024
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 128
+    log_prob_micro_batch_size_per_gpu: null
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
     # for hf rollout
     do_sample: True
     # number of responses (i.e. num sample times)
@@ -83,9 +99,10 @@ critic:
   model:
     path: ~/models/deepseek-llm-7b-chat
     tokenizer_path: ${actor_rollout_ref.model.path}
-    override_config: {}
+    override_config: { }
     external_lib: ${actor_rollout_ref.model.external_lib}
-    enable_gradient_checkpointing: False
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
     fsdp_config:
       param_offload: False
       grad_offload: False
@@ -93,9 +110,16 @@ critic:
       wrap_policy:
         # transformer_layer_cls_to_wrap: None
         min_num_params: 0
+      fsdp_size: -1
   ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
   ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: 64
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
   ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
   shuffle: ${actor_rollout_ref.actor.shuffle}
   grad_clip: 1.0
@@ -108,12 +132,18 @@ reward_model:
     input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
     path: ~/models/FsfairX-LLaMA3-RM-v0.1
     external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
     fsdp_config:
       min_num_params: 0
       param_offload: False
+      fsdp_size: -1
   micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
-  micro_batch_size_per_gpu: 64
+  micro_batch_size_per_gpu: null # set a number
   max_length: null
+  ulysses_sequence_parallel_size: 1 # sp size
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+  reward_manager: naive

 algorithm:
   gamma: 1.0
@@ -126,13 +156,18 @@ algorithm:

 trainer:
   total_epochs: 30
+  total_training_steps: null
   project_name: verl_examples
   experiment_name: gsm8k
-  logger: ['console', 'wandb']
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
   nnodes: 1
   n_gpus_per_node: 8
   save_freq: -1
-  test_freq: 2
+  # auto: find the last ckpt to resume. If can't find, start from scratch
+  resume_mode: auto # or auto or resume_path if
+  resume_from_path: False
+  test_freq: -1
   critic_warmup: 0
-  default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+  default_hdfs_dir: null
   default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
\ No newline at end of file
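Two practical consequences of the config sync are worth noting. The `*_micro_batch_size_per_gpu` fields now default to `null`, so they need to be set at launch (or dynamic batch sizing enabled via `use_dynamic_bsz`) instead of relying on the old hard-coded values. And the updated `fit` only takes effect once it is installed over the stock method; a rough sketch of that monkey patch, with the import path assumed from the repository layout rather than taken from this diff:

```python
# Hypothetical wiring in the example's entry point: replace RayPPOTrainer.fit
# with the copied-and-modified fit from split_monkey_patch.py before training.
from verl.trainer.ppo.ray_trainer import RayPPOTrainer

from split_monkey_patch import fit  # assumed to be importable from the example dir

# Monkey patch: every trainer constructed afterwards runs the futures-based
# actor/critic update instead of the stock sequential loop.
RayPPOTrainer.fit = fit
```

This keeps the rest of `RayPPOTrainer` untouched and swaps in only the split-placement training loop.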