Unverified Commit c8b9c355 by zhou fan, committed by GitHub

fix the split placement example (#281)

The split placement example is outdated; I tried it and encountered some errors. To address this, this PR makes the following changes:
1. Copied the content of `verl/trainer/config/ppo_trainer.yaml` to `examples/split_placement/config/ppo_trainer_split.yaml`.
2. Copied the `RayPPOTrainer.fit` method into the `fit` function in `examples/split_placement/split_monkey_patch.py` and modified it to get the futures of `critic_output` and `actor_output` (see the sketch below).
parent 828df7e8
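For context, the heart of change 2 is the ordering: the critic update and the actor update are both launched before either result is waited on, so the two worker groups run concurrently and the single controller blocks only once. A minimal, runnable sketch of that pattern, using plain Ray tasks as stand-ins for verl's worker groups (in the patched `fit` the worker-group calls return future-like results that are resolved with `get`; every name below is illustrative, not taken from the diff):

```python
import time

import ray

ray.init()

@ray.remote
def update_critic(batch):
    # Stand-in for the critic worker group's update: pretend it takes 1 s.
    time.sleep(1.0)
    return {"critic/loss": 0.0}

@ray.remote
def update_actor(batch):
    # Stand-in for the actor worker group's update: pretend it takes 1 s.
    time.sleep(1.0)
    return {"actor/loss": 0.0}

batch = {}  # stand-in for the DataProto batch
start = time.time()

# Launch both updates before waiting on either one.
critic_future = update_critic.remote(batch)
actor_future = update_actor.remote(batch)

# The single controller process blocks only here, after both are in flight.
critic_output, actor_output = ray.get([critic_future, actor_future])

# Prints roughly 1 s, not 2 s, because the two updates overlapped.
print(f"both updates finished in {time.time() - start:.1f}s")
```

With blocking calls, as in the stock `RayPPOTrainer.fit`, the actor update would only start after the critic update returned; issuing both first is what turns the split placement into an actual speedup.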
...@@ -44,7 +44,7 @@ def update_critic(self, data: DataProto): ...@@ -44,7 +44,7 @@ def update_critic(self, data: DataProto):
... ...
``` ```
We can also parallelize the computation of `ref_log_prob` and `values` and `rewards` in the split placement. For simplicity of the tutorial, we We can also parallelize the computation of `ref_log_prob` and `values` and `rewards` in the split placement. For simplicity of the tutorial, we don't do this in this example.
### Step 3: Execute these operation in parallel in the single controller process ### Step 3: Execute these operation in parallel in the single controller process
To implement the parallel execution of the actor and critic update, the only thing we need to modify in the `ray_trainer.py` is to `get` the concurrent `futures` on the single controller process. To implement the parallel execution of the actor and critic update, the only thing we need to modify in the `ray_trainer.py` is to `get` the concurrent `futures` on the single controller process.
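The line added above points at a further optimization the example deliberately skips: `ref_log_prob`, `values`, and `rewards` could be launched the same way and gathered together. A sketch of what that could look like, where the worker-group handles and method names (`compute_ref_log_prob`, `compute_values`, `compute_rm_score`) mirror the calls made in `RayPPOTrainer.fit` but should be read as assumptions, not as code from this commit:

```python
# Hypothetical extension sketched only in the tutorial text: launch the three
# computations without blocking, then resolve all futures at the end.
def compute_ref_values_rewards_concurrently(ref_policy_wg, critic_wg, rm_wg, batch):
    # All three calls are assumed to be non-blocking and to return
    # future-like objects with a get() method.
    ref_log_prob_future = ref_policy_wg.compute_ref_log_prob(batch)
    values_future = critic_wg.compute_values(batch)
    rewards_future = rm_wg.compute_rm_score(batch)

    # Block once, on the single controller, after all three are in flight.
    return (
        ref_log_prob_future.get(),
        values_future.get(),
        rewards_future.get(),
    )
```

The remainder of the commit updates `examples/split_placement/config/ppo_trainer_split.yaml` so that it matches the current `verl/trainer/config/ppo_trainer.yaml`: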
@@ -9,24 +9,32 @@ data:
   val_batch_size: 1312
   return_raw_input_ids: False # This should be set to true when the tokenizer between policy and rm differs
   return_raw_chat: False
+  shuffle: True

 actor_rollout_ref:
   hybrid_engine: True
   model:
     path: ~/models/deepseek-llm-7b-chat
     external_lib: null
-    override_config: {}
-    enable_gradient_checkpointing: False
+    override_config: { }
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
   actor:
     strategy: fsdp # This is for backward-compatibility
     ppo_mini_batch_size: 256
     ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-    ppo_micro_batch_size_per_gpu: 64
+    ppo_micro_batch_size_per_gpu: null
+    use_dynamic_bsz: False
+    ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
     grad_clip: 1.0
     clip_ratio: 0.2
     entropy_coeff: 0.001
+    use_kl_loss: False # True for GRPO
+    kl_loss_coef: 0.001 # for grpo
+    kl_loss_type: low_var_kl # for grpo
     ppo_epochs: 1
-    shuffle: True
+    shuffle: False
+    ulysses_sequence_parallel_size: 1 # sp size
     optim:
       lr: 1e-6
       lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
@@ -40,6 +48,7 @@ actor_rollout_ref:
       param_offload: False
       grad_offload: False
       optimizer_offload: False
+      fsdp_size: -1
   ref:
     fsdp_config:
       param_offload: False
@@ -47,7 +56,10 @@ actor_rollout_ref:
         # transformer_layer_cls_to_wrap: None
         min_num_params: 0
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 128
+    log_prob_micro_batch_size_per_gpu: null
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
   rollout:
     name: vllm
     temperature: 1.0
@@ -66,7 +78,11 @@ actor_rollout_ref:
     max_num_batched_tokens: 8192
     max_num_seqs: 1024
     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
-    log_prob_micro_batch_size_per_gpu: 128
+    log_prob_micro_batch_size_per_gpu: null
+    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+    disable_log_stats: True
+    enable_chunked_prefill: True # could get higher throughput
     # for hf rollout
     do_sample: True
     # number of responses (i.e. num sample times)
@@ -83,9 +99,10 @@ critic:
   model:
     path: ~/models/deepseek-llm-7b-chat
     tokenizer_path: ${actor_rollout_ref.model.path}
-    override_config: {}
+    override_config: { }
     external_lib: ${actor_rollout_ref.model.external_lib}
-    enable_gradient_checkpointing: False
+    enable_gradient_checkpointing: True
+    use_remove_padding: False
     fsdp_config:
       param_offload: False
       grad_offload: False
@@ -93,9 +110,16 @@ critic:
       wrap_policy:
         # transformer_layer_cls_to_wrap: None
         min_num_params: 0
+      fsdp_size: -1
   ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
   ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
-  ppo_micro_batch_size_per_gpu: 64
+  ppo_micro_batch_size_per_gpu: null
+  forward_micro_batch_size: ${critic.ppo_micro_batch_size}
+  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
+  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+  ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
+  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
+  ulysses_sequence_parallel_size: 1 # sp size
   ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
   shuffle: ${actor_rollout_ref.actor.shuffle}
   grad_clip: 1.0
@@ -108,12 +132,18 @@ reward_model:
     input_tokenizer: ${actor_rollout_ref.model.path} # set this to null if the chat template is identical
     path: ~/models/FsfairX-LLaMA3-RM-v0.1
     external_lib: ${actor_rollout_ref.model.external_lib}
+    use_remove_padding: False
     fsdp_config:
       min_num_params: 0
       param_offload: False
+      fsdp_size: -1
   micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
-  micro_batch_size_per_gpu: 64
+  micro_batch_size_per_gpu: null # set a number
   max_length: null
+  ulysses_sequence_parallel_size: 1 # sp size
+  use_dynamic_bsz: ${critic.use_dynamic_bsz}
+  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}
+  reward_manager: naive

 algorithm:
   gamma: 1.0
@@ -126,13 +156,18 @@ algorithm:

 trainer:
   total_epochs: 30
+  total_training_steps: null
   project_name: verl_examples
   experiment_name: gsm8k
-  logger: ['console', 'wandb']
+  logger: [ 'console', 'wandb' ]
+  val_generations_to_log_to_wandb: 0
   nnodes: 1
   n_gpus_per_node: 8
   save_freq: -1
-  test_freq: 2
+  # auto: find the last ckpt to resume. If can't find, start from scratch
+  resume_mode: auto # or auto or resume_path if
+  resume_from_path: False
+  test_freq: -1
   critic_warmup: 0
-  default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
+  default_hdfs_dir: null
   default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}
\ No newline at end of file
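Two practical consequences of the config sync are worth noting. The `*_micro_batch_size_per_gpu` fields now default to `null`, so they need to be set at launch (or dynamic batch sizing enabled via `use_dynamic_bsz`) instead of relying on the old hard-coded values. And the updated `fit` only takes effect once it is installed over the stock method; a rough sketch of that monkey patch, with the import path assumed from the repository layout rather than taken from this diff:

```python
# Hypothetical wiring in the example's entry point: replace RayPPOTrainer.fit
# with the copied-and-modified fit from split_monkey_patch.py before training.
from verl.trainer.ppo.ray_trainer import RayPPOTrainer

from split_monkey_patch import fit  # assumed to be importable from the example dir

# Monkey patch: every trainer constructed afterwards runs the futures-based
# actor/critic update instead of the stock sequential loop.
RayPPOTrainer.fit = fit
```

This keeps the rest of `RayPPOTrainer` untouched and swaps in only the split-placement training loop.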