Unverified commit f2a76acd by Guangming Sheng, committed by GitHub

[BREAKING][misc] feat: change micro_batch_size to micro_batch_size_per_gpu (#136)

## Summary

This PR renames every `micro_batch_size` setting to `micro_batch_size_per_gpu`.

**The core logic of setting batch sizes** (see the sketch below):

- **All algorithmic batch sizes** (train batch size, PPO mini batch size)
  are global (from the perspective of the single-controller) and are
  normalized inside each Worker.
- **All performance-related parameters** (micro batch size, max token
  length in dynamic batch size) are local: they describe the data size
  per GPU (i.e., within each Worker).
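
As a minimal sketch of this convention (the helper and the flat config dict here are hypothetical, not verl's actual code):

```python
# Minimal sketch of the batch-size convention above. The helper and the
# flat config dict are hypothetical; verl's actual code is structured
# differently.

def resolve_batch_sizes(cfg: dict, dp_world_size: int) -> dict:
    return {
        # Algorithmic sizes arrive as global values; each Worker
        # normalizes them by the data-parallel world size.
        "ppo_mini_batch_size_per_gpu":
            cfg["ppo_mini_batch_size"] // dp_world_size,
        # Performance sizes are already per GPU and are used as-is.
        "ppo_micro_batch_size_per_gpu":
            cfg["ppo_micro_batch_size_per_gpu"],
    }

# A global mini batch of 256 on 8 GPUs is 32 samples per GPU, while the
# micro batch of 8 is interpreted per GPU directly.
print(resolve_batch_sizes(
    {"ppo_mini_batch_size": 256, "ppo_micro_batch_size_per_gpu": 8},
    dp_world_size=8))
```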

## Main Changes

1. Update the scripts and configs, and remove the normalization of the
   micro batch sizes (they are now specified per GPU directly; see the
   sketch below).
2. Fix CI for SFT.
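
Because the old global fields remain in the configs as deprecated `null` placeholders, a backward-compatible lookup could work roughly as follows (a hypothetical shim, not verl's actual code):

```python
# Hypothetical backward-compatibility shim, not verl's actual code:
# prefer the new per-GPU field, and fall back to the deprecated global
# field by normalizing it with the data-parallel world size.
import warnings

def get_micro_bsz_per_gpu(cfg: dict, dp_size: int) -> int:
    if cfg.get("ppo_micro_batch_size_per_gpu") is not None:
        return cfg["ppo_micro_batch_size_per_gpu"]
    if cfg.get("ppo_micro_batch_size") is not None:
        warnings.warn("ppo_micro_batch_size will be deprecated; "
                      "use ppo_micro_batch_size_per_gpu instead")
        return cfg["ppo_micro_batch_size"] // dp_size
    raise ValueError("no micro batch size configured")

# Old-style config: a global micro batch of 64 on 8 GPUs becomes 8 per GPU.
print(get_micro_bsz_per_gpu({"ppo_micro_batch_size": 64}, dp_size=8))
```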
parent c17e6c62
```diff
@@ -43,4 +43,4 @@ jobs:
       - name: Running gsm8k e2e training tests with LoRA
         run: |
           ray stop --force
-          bash examples/sft/gsm8k/run_qwen_05_peft.sh 8 $HOME/ckpts/
+          bash tests/sft/run_sft_qwen05_peft.sh 8 $HOME/ckpts/
 \ No newline at end of file
```
```diff
@@ -59,60 +59,79 @@ Actor/Rollout/Reference Policy
 .. code:: yaml
 
   actor_rollout_ref:
    hybrid_engine: True
    model:
      path: ~/models/deepseek-llm-7b-chat
      external_lib: null
-     override_config: {}
+     override_config: { }
      enable_gradient_checkpointing: False
+     use_remove_padding: False
    actor:
      strategy: fsdp  # This is for backward-compatibility
      ppo_mini_batch_size: 256
-     ppo_micro_batch_size: 64
+     ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+     ppo_micro_batch_size_per_gpu: 8
+     use_dynamic_bsz: False
+     ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
      grad_clip: 1.0
      clip_ratio: 0.2
      entropy_coeff: 0.001
+     use_kl_loss: False # True for GRPO
+     kl_loss_coef: 0.001 # for grpo
+     kl_loss_type: low_var_kl # for grpo
      ppo_epochs: 1
-     shuffle: True
+     shuffle: False
+     ulysses_sequence_parallel_size: 1 # sp size
      optim:
        lr: 1e-6
        lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
        min_lr_ratio: null   # only useful for warmup with cosine
        warmup_style: constant  # select from constant/cosine
        total_training_steps: -1  # must be override by program
      fsdp_config:
        wrap_policy:
          # transformer_layer_cls_to_wrap: None
          min_num_params: 0
        param_offload: False
        grad_offload: False
        optimizer_offload: False
+       fsdp_size: -1
    ref:
      fsdp_config:
        param_offload: False
        wrap_policy:
          # transformer_layer_cls_to_wrap: None
          min_num_params: 0
-     log_prob_micro_batch_size: 128
+     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+     log_prob_micro_batch_size_per_gpu: 16
+     log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+     log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+     ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
    rollout:
      name: vllm
      temperature: 1.0
      top_k: -1 # 0 for hf rollout, -1 for vllm rollout
      top_p: 1
+     prompt_length: ${data.max_prompt_length} # not use for opensource
      response_length: ${data.max_response_length}
      # for vllm rollout
      dtype: bfloat16 # should align with FSDP
      gpu_memory_utilization: 0.5
      ignore_eos: False
      enforce_eager: True
      free_cache_engine: True
-     load_format: dummy_dtensor # or dummy_hf or dummy_megatron
+     load_format: dummy_dtensor
      tensor_model_parallel_size: 2
      max_num_batched_tokens: 8192
      max_num_seqs: 1024
-     log_prob_micro_batch_size: 128
-     # for vllm and hf rollout
-     do_sample: True
+     log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+     log_prob_micro_batch_size_per_gpu: 16
+     log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
+     log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
+     # for hf rollout
+     do_sample: True
+     # number of responses (i.e. num sample times)
+     n: 1 # > 1 for grpo
```

**Common config for actor, rollout and reference model**
```diff
@@ -136,11 +155,15 @@ Actor/Rollout/Reference Policy
 - ``actor_rollout_ref.actor.ppo_mini_batch_size``: One sample is split
   into multiple sub-batches with batch_size=ppo_mini_batch_size for PPO
-  updates
-- ``actor_rollout_ref.actor.ppo_micro_batch_size``: Similar to gradient
-  accumulation, the micro_batch_size for one forward pass, trading speed
-  for GPU memory
+  updates. The ppo_mini_batch_size is a global size across all workers/GPUs.
+- ``actor_rollout_ref.actor.ppo_micro_batch_size``: [Will be deprecated; use ppo_micro_batch_size_per_gpu]
+  Similar to gradient accumulation, the micro batch size for one forward pass,
+  trading speed for GPU memory. The value represents the global view.
+- ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``: Similar to gradient
+  accumulation, the micro_batch_size_per_gpu for one forward pass, trading speed
+  for GPU memory. The value represents the local size per GPU.
 - ``actor_rollout_ref.actor.grad_clip``: Gradient clipping for actor
   updates
```
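
For intuition, a worked example with numbers in the same range as the example scripts below (one node with 8 GPUs; the figures are illustrative):

```python
# Illustrative arithmetic only: how the global mini batch size and the
# local micro batch size interact on a single node with 8 GPUs.
n_gpus = 8
ppo_mini_batch_size = 256            # global, split across all GPUs
ppo_micro_batch_size_per_gpu = 8     # local, per GPU

samples_per_gpu = ppo_mini_batch_size // n_gpus                     # 32
grad_accum_steps = samples_per_gpu // ppo_micro_batch_size_per_gpu  # 4
print(samples_per_gpu, grad_accum_steps)  # -> 32 4
```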
```diff
@@ -176,8 +199,12 @@ Actor/Rollout/Reference Policy
 - ``actor_rollout_ref.ref``: FSDP config same as actor. **For models
   larger than 7B, it's recommended to turn on offload for ref by
   default**
-- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: The batch size
-  for one forward pass in the computation of ``ref_log_prob``.
+- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: [Will be deprecated; use log_prob_micro_batch_size_per_gpu]
+  The batch size for one forward pass in the computation of ``ref_log_prob``. The value represents the global size.
+- ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``: The batch size
+  for one forward pass in the computation of ``ref_log_prob``. The value represents the local size per GPU.
```

**Rollout Model**
```diff
@@ -201,8 +228,11 @@ Actor/Rollout/Reference Policy
 - ``tensor_model_parallel_size``: TP size for rollout. Only effective
   for vllm.
-- ``log_prob_micro_batch_size``: Micro_batch_size (The batch size for
-  one forward pass) for recalculating log_prob.
+- ``log_prob_micro_batch_size``: [Will be deprecated; use log_prob_micro_batch_size_per_gpu]
+  The batch size for one forward pass in the computation of ``log_prob``. The value represents the global size.
+- ``log_prob_micro_batch_size_per_gpu``: Micro batch size per GPU (the batch size for
+  one forward pass) for recalculating ``log_prob``. The value represents the local size per GPU.
 - ``do_sample``: Whether to sample. If set to False, the rollout model
   will perform greedy sampling. We disable ``do_sample`` during
```
```diff
@@ -260,7 +290,7 @@ Reward Model
     fsdp_config:
       min_num_params: 0
       param_offload: False
-    micro_batch_size: 64
+    micro_batch_size_per_gpu: 16
     max_length: null
 
 - ``reward_model.enable``: Whether to enable reward model. If False, we
```
```diff
@@ -85,7 +85,7 @@ We also provide various training scripts for SFT on GSM8K dataset in `gsm8k sft
     data.val_files=$HOME/data/gsm8k/test.parquet \
     data.prompt_key=question \
     data.response_key=answer \
-    data.micro_batch_size=8 \
+    data.micro_batch_size_per_gpu=8 \
     model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \
     trainer.default_hdfs_dir=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ \
     trainer.project_name=gsm8k-sft \
```
```diff
@@ -136,21 +136,20 @@ The script of run_deepseek7b_llm.sh
     actor_rollout_ref.model.path=~/models/deepseek-llm-7b-chat \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
     actor_rollout_ref.actor.fsdp_config.param_offload=False \
     actor_rollout_ref.actor.fsdp_config.grad_offload=False \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.micro_batch_size=256 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     critic.optim.lr=1e-5 \
     critic.model.path=~/models/deepseek-llm-7b-chat \
     critic.model.enable_gradient_checkpointing=False \
-    critic.ppo_micro_batch_size=64 \
+    critic.ppo_micro_batch_size_per_gpu=16 \
     critic.model.fsdp_config.param_offload=False \
     critic.model.fsdp_config.grad_offload=False \
     critic.model.fsdp_config.optimizer_offload=False \
```
```diff
@@ -92,14 +92,14 @@ Set the ``data.train_files``, ``data.val_files``, ``actor_rollout_ref.model.pa
     actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.actor.ppo_mini_batch_size=64 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=4 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=8 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=4 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
     critic.optim.lr=1e-5 \
     critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
-    critic.ppo_micro_batch_size=4 \
+    critic.ppo_micro_batch_size_per_gpu=4 \
     algorithm.kl_ctrl.kl_coef=0.001 \
     trainer.logger=['console'] \
     +trainer.val_before_train=False \
```
```diff
@@ -133,8 +133,8 @@ If you encounter out of memory issues with HBM less than 32GB, enable the follow
 .. code-block:: bash
 
-    actor_rollout_ref.actor.ppo_micro_batch_size=1 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
-    critic.ppo_micro_batch_size=1 \
+    critic.ppo_micro_batch_size_per_gpu=1 \
 
 For the full set of configs, please refer to :ref:`config-explain-page` for detailed explanation and performance tuning.
```
```diff
@@ -12,7 +12,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=128 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \
     actor_rollout_ref.actor.use_kl_loss=True \
     actor_rollout_ref.actor.kl_loss_coef=0.001 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
@@ -20,16 +20,16 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.fsdp_config.param_offload=False \
     actor_rollout_ref.actor.fsdp_config.grad_offload=False \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=256 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
     actor_rollout_ref.rollout.n=5 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=256 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     algorithm.kl_ctrl.kl_coef=0.001 \
     trainer.critic_warmup=0 \
-    trainer.logger=['console','wandb'] \
+    trainer.logger=['console'] \
     trainer.project_name='verl_grpo_example_gsm8k' \
     trainer.experiment_name='deepseek_llm_7b_function_rm' \
     trainer.n_gpus_per_node=8 \
```
```diff
@@ -14,7 +14,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=128 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \
     actor_rollout_ref.actor.use_kl_loss=True \
     actor_rollout_ref.actor.kl_loss_coef=0.001 \
     actor_rollout_ref.actor.kl_loss_type=low_var_kl \
@@ -22,12 +22,12 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.fsdp_config.param_offload=False \
     actor_rollout_ref.actor.fsdp_config.grad_offload=False \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=256 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
     actor_rollout_ref.rollout.n=5 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=256 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     algorithm.kl_ctrl.kl_coef=0.001 \
     trainer.critic_warmup=0 \
```
```diff
@@ -11,21 +11,22 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=32 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
     actor_rollout_ref.actor.fsdp_config.param_offload=False \
     actor_rollout_ref.actor.fsdp_config.grad_offload=False \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     critic.optim.lr=1e-5 \
     critic.model.use_remove_padding=True \
     critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
-    critic.model.enable_gradient_checkpointing=False \
+    critic.model.enable_gradient_checkpointing=True \
-    critic.ppo_micro_batch_size=32 \
+    critic.ppo_micro_batch_size_per_gpu=32 \
     critic.model.fsdp_config.param_offload=False \
     critic.model.fsdp_config.grad_offload=False \
     critic.model.fsdp_config.optimizer_offload=False \
@@ -37,4 +38,5 @@ python3 -m verl.trainer.main_ppo \
     trainer.n_gpus_per_node=8 \
     trainer.nnodes=1 \
     trainer.save_freq=-1 \
+    trainer.test_freq=1 \
     trainer.total_epochs=15 $@
```
```diff
@@ -11,24 +11,24 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=128 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
     actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.fsdp_config.param_offload=False \
     actor_rollout_ref.actor.fsdp_config.grad_offload=False \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=256 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=256 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     critic.optim.lr=1e-5 \
     critic.ulysses_sequence_parallel_size=2 \
     critic.model.use_remove_padding=True \
     critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
-    critic.model.enable_gradient_checkpointing=False \
+    critic.model.enable_gradient_checkpointing=True \
-    critic.ppo_micro_batch_size=64 \
+    critic.ppo_micro_batch_size_per_gpu=64 \
     critic.model.fsdp_config.param_offload=False \
     critic.model.fsdp_config.grad_offload=False \
     critic.model.fsdp_config.optimizer_offload=False \
```
```diff
@@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat
     actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=16 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=16 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
     actor_rollout_ref.ref.param_offload=False \
     critic.optim.lr=1e-5 \
     critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
     critic.model.enable_gradient_checkpointing=False \
-    critic.ppo_micro_batch_size=16 \
+    critic.ppo_micro_batch_size_per_gpu=4 \
     reward_model.enable=True \
     reward_model.megatron.tensor_model_parallel_size=4 \
     reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \
-    reward_model.micro_batch_size=16 \
+    reward_model.micro_batch_size_per_gpu=4 \
     reward_model.param_offload=False \
     algorithm.kl_ctrl.kl_coef=0.001 \
     trainer.critic_warmup=0 \
```
```diff
@@ -18,16 +18,16 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat
     actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=32 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
     critic.optim.lr=1e-5 \
     critic.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
     critic.model.enable_gradient_checkpointing=False \
-    critic.ppo_micro_batch_size=32 \
+    critic.ppo_micro_batch_size_per_gpu=4 \
     algorithm.kl_ctrl.kl_coef=0.001 \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
```
```diff
@@ -10,16 +10,16 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat
     actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
     actor_rollout_ref.actor.optim.lr=2e-6 \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=64 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=64 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
     critic.optim.lr=2e-5 \
     critic.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
     critic.model.enable_gradient_checkpointing=False \
-    critic.ppo_micro_batch_size=64 \
+    critic.ppo_micro_batch_size_per_gpu=8 \
     algorithm.kl_ctrl.kl_coef=0.001 \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
```
```diff
@@ -11,21 +11,21 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.ppo_mini_batch_size=128 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=4 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
     actor_rollout_ref.actor.fsdp_config.param_offload=False \
     actor_rollout_ref.actor.fsdp_config.grad_offload=False \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=4 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=4 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     critic.optim.lr=1e-5 \
     critic.model.use_remove_padding=True \
     critic.model.path=google/gemma-2-2b-it \
     critic.model.enable_gradient_checkpointing=False \
-    critic.ppo_micro_batch_size=4 \
+    critic.ppo_micro_batch_size_per_gpu=4 \
     critic.model.fsdp_config.param_offload=False \
     critic.model.fsdp_config.grad_offload=False \
     critic.model.fsdp_config.optimizer_offload=False \
```
```diff
@@ -19,21 +19,22 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=16 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.fsdp_config.param_offload=False \
     actor_rollout_ref.actor.fsdp_config.grad_offload=False \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=vllm \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=16 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     critic.optim.lr=1e-5 \
     critic.model.use_remove_padding=True \
     critic.model.path=Qwen/Qwen2-7B-Instruct \
-    critic.model.enable_gradient_checkpointing=False \
+    critic.model.enable_gradient_checkpointing=True \
-    critic.ppo_micro_batch_size=16 \
+    critic.ppo_micro_batch_size_per_gpu=32 \
     critic.model.fsdp_config.param_offload=False \
     critic.model.fsdp_config.grad_offload=False \
     critic.model.fsdp_config.optimizer_offload=False \
```
```diff
@@ -23,22 +23,22 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=16 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
     actor_rollout_ref.actor.fsdp_config.param_offload=False \
     actor_rollout_ref.actor.fsdp_config.grad_offload=False \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=16 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     critic.optim.lr=1e-5 \
     critic.model.use_remove_padding=True \
     critic.optim.lr_warmup_steps_ratio=0.05 \
     critic.model.path=Qwen/Qwen2-7B-Instruct \
     critic.model.enable_gradient_checkpointing=False \
-    critic.ppo_micro_batch_size=16 \
+    critic.ppo_micro_batch_size_per_gpu=16 \
     critic.model.fsdp_config.param_offload=False \
     critic.model.fsdp_config.grad_offload=False \
     critic.model.fsdp_config.optimizer_offload=False \
@@ -46,7 +46,7 @@ python3 -m verl.trainer.main_ppo \
     reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
     reward_model.model.use_remove_padding=True \
     reward_model.model.fsdp_config.param_offload=True \
-    reward_model.micro_batch_size=16 \
+    reward_model.micro_batch_size_per_gpu=32 \
     algorithm.kl_ctrl.kl_coef=0.001 \
     trainer.critic_warmup=0 \
     trainer.logger=['console','wandb'] \
```
```diff
@@ -45,7 +45,7 @@ python3 -m verl.trainer.main_ppo \
     reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
     reward_model.model.use_remove_padding=True \
     reward_model.model.fsdp_config.param_offload=True \
-    reward_model.micro_batch_size=16 \
+    reward_model.micro_batch_size_per_gpu=32 \
     reward_model.use_dynamic_bsz=True \
     reward_model.forward_max_token_len_per_gpu=98304 \
     algorithm.kl_ctrl.kl_coef=0.001 \
```
```diff
@@ -20,21 +20,22 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.model.use_remove_padding=True \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=16 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
+    actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.fsdp_config.param_offload=False \
     actor_rollout_ref.actor.fsdp_config.grad_offload=False \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
     actor_rollout_ref.rollout.name=vllm \
-    actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
+    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     critic.optim.lr=1e-5 \
     critic.model.use_remove_padding=True \
     critic.model.path=Qwen/Qwen2.5-32B-Instruct \
     critic.model.enable_gradient_checkpointing=False \
-    critic.ppo_micro_batch_size=32 \
+    critic.ppo_micro_batch_size_per_gpu=8 \
     critic.model.fsdp_config.param_offload=False \
     critic.model.fsdp_config.grad_offload=False \
     critic.model.fsdp_config.optimizer_offload=False \
```
```diff
@@ -646,7 +646,7 @@
     "\u001b[36m(main_task pid=28294)\u001b[0m 'path': '/teamspace/studios/this_studio/models/Qwen2.5-0.5B-Instruct'},\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'ref': {'fsdp_config': {'param_offload': False,\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'wrap_policy': {'min_num_params': 0}},\n",
-    "\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size': 4},\n",
+    "\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size_per_gpu': 4},\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'rollout': {'do_sample': True,\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'dtype': 'bfloat16',\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'enforce_eager': True,\n",
@@ -654,7 +654,7 @@
     "\u001b[36m(main_task pid=28294)\u001b[0m 'gpu_memory_utilization': 0.4,\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'ignore_eos': False,\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'load_format': 'dummy_dtensor',\n",
-    "\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size': 1,\n",
+    "\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size_per_gpu': 1,\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'max_num_batched_tokens': 8192,\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'max_num_seqs': 1024,\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'n': 1,\n",
@@ -671,7 +671,7 @@
     "\u001b[36m(main_task pid=28294)\u001b[0m 'kl_penalty': 'kl',\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'lam': 1.0},\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'critic': {'cliprange_value': 0.5,\n",
-    "\u001b[36m(main_task pid=28294)\u001b[0m 'forward_micro_batch_size': 4,\n",
+    "\u001b[36m(main_task pid=28294)\u001b[0m 'forward_micro_batch_size_per_gpu': 4,\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'grad_clip': 1.0,\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'model': {'enable_gradient_checkpointing': False,\n",
     "\u001b[36m(main_task pid=28294)\u001b[0m 'external_lib': None,\n",
@@ -1110,10 +1110,10 @@
     "    actor_rollout_ref.actor.optim.lr=1e-6 \\\n",
     "    actor_rollout_ref.actor.ppo_mini_batch_size=64 \\\n",
     "    actor_rollout_ref.actor.ppo_micro_batch_size=1 \\\n",
-    "    actor_rollout_ref.rollout.log_prob_micro_batch_size=1 \\\n",
+    "    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \\\n",
     "    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \\\n",
     "    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \\\n",
-    "    actor_rollout_ref.ref.log_prob_micro_batch_size=4 \\\n",
+    "    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \\\n",
     "    critic.optim.lr=1e-5 \\\n",
     "    critic.model.path=$HOME/models/Qwen2.5-0.5B-Instruct \\\n",
     "    critic.ppo_micro_batch_size=1 \\\n",
```
```diff
 set -x
 
-hdfs_path=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ # replace to your own hdfs/local path
+if [ "$#" -lt 2 ]; then
+    echo "Usage: run_deepseek_6b7.sh <nproc_per_node> <save_path> [other_configs...]"
+    exit 1
+fi
 
 nproc_per_node=$1
+save_path=$2
+
+# Shift the arguments so $@ refers to the rest
+shift 2
 
 torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
     -m verl.trainer.fsdp_sft_trainer \
     data.train_files=$HOME/data/gsm8k/train.parquet \
     data.val_files=$HOME/data/gsm8k/test.parquet \
-    data.prompt_key=prompt \
-    data.response_key=answer \
-    data.micro_batch_size=8 \
+    data.prompt_key=extra_info \
+    data.response_key=extra_info \
+    +data.prompt_dict_keys=['question'] \
+    +data.response_dict_keys=['answer'] \
+    data.micro_batch_size_per_gpu=4 \
     model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \
-    trainer.default_hdfs_dir=$hdfs_path \
+    trainer.default_local_dir=$save_path \
     trainer.project_name=gsm8k-sft \
     trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \
     trainer.total_epochs=4 \
-    trainer.logger=['console','wandb']
+    trainer.logger=['console','wandb'] \
+    trainer.default_hdfs_dir=null $@
 \ No newline at end of file
```
```diff
@@ -21,7 +21,7 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
     data.response_key=extra_info \
     +data.prompt_dict_keys=['question'] \
     +data.response_dict_keys=['answer'] \
-    data.micro_batch_size=8 \
+    data.micro_batch_size_per_gpu=4 \
     model.partial_pretrain=google/gemma-2b-it \
     trainer.default_local_dir=$save_path \
     trainer.project_name=gsm8k-sft \
```
```diff
 set -x
 
-hdfs_path=hdfs://user/verl/experiments/gsm8k/gemma-1.1-7b-it/ # replace to your own hdfs/local path
+if [ "$#" -lt 2 ]; then
+    echo "Usage: run_gemma_7b.sh <nproc_per_node> <save_path> [other_configs...]"
+    exit 1
+fi
 
 nproc_per_node=$1
+save_path=$2
+
+# Shift the arguments so $@ refers to the rest
+shift 2
 
 torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
     -m verl.trainer.fsdp_sft_trainer \
@@ -10,10 +17,11 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
     data.val_files=$HOME/data/gsm8k/test.parquet \
     data.prompt_key=prompt \
     data.response_key=answer \
-    data.micro_batch_size=8 \
+    data.micro_batch_size_per_gpu=4 \
     model.partial_pretrain=google/gemma-1.1-7b-it \
-    trainer.default_hdfs_dir=$hdfs_path \
+    trainer.default_local_dir=$save_path \
     trainer.project_name=gsm8k-sft \
     trainer.experiment_name=gsm8k-sft-gemma-1.1-7b-it \
     trainer.total_epochs=4 \
-    trainer.logger=['console','wandb']
+    trainer.logger=['console','wandb'] \
+    trainer.default_hdfs_dir=null $@
 \ No newline at end of file
```
```diff
@@ -22,13 +22,13 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
     optim.lr=1e-4 \
     +data.prompt_dict_keys=['question'] \
     +data.response_dict_keys=['answer'] \
-    data.micro_batch_size=32 \
+    data.micro_batch_size_per_gpu=4 \
     model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
     trainer.default_local_dir=$save_path \
     trainer.project_name=gsm8k-sft \
     trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct \
     trainer.logger=['console'] \
-    trainer.total_training_steps=1 \
+    trainer.total_epochs=1 \
     trainer.default_hdfs_dir=null $@ \
     model.lora_rank=32 \
     model.lora_alpha=16 \
```
```diff
@@ -20,7 +20,8 @@ actor_rollout_ref:
   actor:
     strategy: fsdp  # This is for backward-compatibility
     ppo_mini_batch_size: 256
-    ppo_micro_batch_size: 64
+    ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+    ppo_micro_batch_size_per_gpu: 64
     grad_clip: 1.0
     clip_ratio: 0.2
     entropy_coeff: 0.001
@@ -45,7 +46,8 @@ actor_rollout_ref:
       wrap_policy:
         # transformer_layer_cls_to_wrap: None
        min_num_params: 0
-    log_prob_micro_batch_size: 128
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 128
   rollout:
     name: vllm
     temperature: 1.0
@@ -63,7 +65,8 @@ actor_rollout_ref:
     tensor_model_parallel_size: 2
     max_num_batched_tokens: 8192
     max_num_seqs: 1024
-    log_prob_micro_batch_size: 128
+    log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
+    log_prob_micro_batch_size_per_gpu: 128
     # for hf rollout
     do_sample: True
     # number of responses (i.e. num sample times)
@@ -91,7 +94,8 @@ critic:
       # transformer_layer_cls_to_wrap: None
      min_num_params: 0
   ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
-  ppo_micro_batch_size: 64
+  ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
+  ppo_micro_batch_size_per_gpu: 64
   ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
   shuffle: ${actor_rollout_ref.actor.shuffle}
   grad_clip: 1.0
@@ -107,7 +111,8 @@ reward_model:
   fsdp_config:
     min_num_params: 0
     param_offload: False
-  micro_batch_size: 64
+  micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
+  micro_batch_size_per_gpu: 64
   max_length: null
 
 algorithm:
```
```diff
@@ -10,20 +10,20 @@ python3 main_ppo_split.py \
     actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.actor.ppo_mini_batch_size=256 \
-    actor_rollout_ref.actor.ppo_micro_batch_size=16 \
+    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
     actor_rollout_ref.actor.fsdp_config.param_offload=False \
     actor_rollout_ref.actor.fsdp_config.grad_offload=False \
     actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-    actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
+    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
     actor_rollout_ref.rollout.name=vllm \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
-    actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
+    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
     actor_rollout_ref.ref.fsdp_config.param_offload=True \
     critic.optim.lr=1e-5 \
     critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
     critic.model.enable_gradient_checkpointing=False \
-    critic.ppo_micro_batch_size=16 \
+    critic.ppo_micro_batch_size_per_gpu=8 \
     critic.model.fsdp_config.param_offload=False \
     critic.model.fsdp_config.grad_offload=False \
     critic.model.fsdp_config.optimizer_offload=False \
```
...@@ -22,7 +22,8 @@ actor_rollout_ref: ...@@ -22,7 +22,8 @@ actor_rollout_ref:
actor: actor:
strategy: fsdp # This is for backward-compatibility strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 200 ppo_mini_batch_size: 200
ppo_micro_batch_size: 200 ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
use_dynamic_bsz: False use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0 grad_clip: 1.0
...@@ -50,7 +51,8 @@ actor_rollout_ref: ...@@ -50,7 +51,8 @@ actor_rollout_ref:
wrap_policy: wrap_policy:
# transformer_layer_cls_to_wrap: None # transformer_layer_cls_to_wrap: None
min_num_params: 0 min_num_params: 0
micro_batch_size: 200 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
...@@ -65,14 +67,15 @@ actor_rollout_ref: ...@@ -65,14 +67,15 @@ actor_rollout_ref:
dtype: bfloat16 # should align with FSDP dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.1 gpu_memory_utilization: 0.1
ignore_eos: False ignore_eos: False
micro_batch_size: 200 micro_batch_size_per_gpu: 200
enforce_eager: True enforce_eager: True
free_cache_engine: True free_cache_engine: True
load_format: dummy_dtensor load_format: dummy_dtensor
tensor_model_parallel_size: 1 tensor_model_parallel_size: 1
max_num_batched_tokens: 8192 max_num_batched_tokens: 8192
max_num_seqs: 1024 max_num_seqs: 1024
log_prob_micro_batch_size: 200 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
# for hf rollout # for hf rollout
...@@ -80,6 +83,7 @@ actor_rollout_ref: ...@@ -80,6 +83,7 @@ actor_rollout_ref:
# number of responses (i.e. num sample times) # number of responses (i.e. num sample times)
n: 1 # > 1 for grpo n: 1 # > 1 for grpo
critic: critic:
strategy: fsdp strategy: fsdp
optim: optim:
...@@ -100,8 +104,10 @@ critic: ...@@ -100,8 +104,10 @@ critic:
# transformer_layer_cls_to_wrap: None # transformer_layer_cls_to_wrap: None
min_num_params: 0 min_num_params: 0
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: 200 ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size} forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
...@@ -128,7 +134,8 @@ reward_model: ...@@ -128,7 +134,8 @@ reward_model:
fsdp_config: fsdp_config:
min_num_params: 0 min_num_params: 0
fsdp_size: -1 fsdp_size: -1
micro_batch_size: 8 micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
micro_batch_size_per_gpu: null # set a number
max_length: null max_length: null
ulysses_sequence_parallel_size: 1 # sp size ulysses_sequence_parallel_size: 1 # sp size
......
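A note on the convention used throughout these config diffs: each deprecated global key is kept but nulled out, and a sibling `*_per_gpu` key is introduced, so exactly one of the pair may be set. A minimal sketch of that contract (assuming only `omegaconf`; the dict fragment is illustrative, not a full verl config):

```python
from omegaconf import OmegaConf

# Illustrative fragment mirroring the reward_model section above.
cfg = OmegaConf.create({
    "micro_batch_size": None,          # deprecated global knob, kept for compatibility
    "micro_batch_size_per_gpu": None,  # new per-GPU knob; the user should set this one
})

cfg.micro_batch_size_per_gpu = 16
# Exactly one of the pair may be non-null; both set (or both null) is a config error.
assert (cfg.micro_batch_size is None) != (cfg.micro_batch_size_per_gpu is None)
```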
...@@ -105,14 +105,6 @@ def main(config): ...@@ -105,14 +105,6 @@ def main(config):
from omegaconf import OmegaConf from omegaconf import OmegaConf
pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values
dp_size = config.trainer.n_gpus_per_node * config.trainer.nnodes
# normalize batch_size
# TODO: move this inside each role
config.actor_rollout_ref.actor.ppo_mini_batch_size //= dp_size
config.actor_rollout_ref.actor.ppo_micro_batch_size //= dp_size
config.critic.ppo_micro_batch_size //= dp_size
config.actor_rollout_ref.rollout.micro_batch_size //= dp_size
# print the config # print the config
# print initial config # print initial config
print('Config after normalizing batch_size') print('Config after normalizing batch_size')
......
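The deleted block above is the crux of the change: the driver used to divide every micro batch size by the data-parallel size, which is what made the YAML values global. With the per-GPU convention that division disappears from the driver entirely. A hedged sketch of the equivalence, with values assumed for an 8-GPU single-node run:

```python
# What the removed normalization effectively computed (old global convention).
n_gpus_per_node, nnodes = 8, 1         # assumed trainer settings
dp_size = n_gpus_per_node * nnodes

old_ppo_micro_batch_size = 64          # old-style global value from the YAML
per_gpu = old_ppo_micro_batch_size // dp_size
assert per_gpu == 8                    # the value a user now writes directly as *_per_gpu
```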
...@@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo \ ...@@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \ actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \ critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \ critic.model.use_remove_padding=True \
critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \ critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \ critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \ critic.model.fsdp_config.optimizer_offload=False \
......
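The script rewrite above is mechanical: each old global micro batch size divided by the run's data-parallel size (8 GPUs here, `ulysses_sequence_parallel_size=1` assumed) gives the new per-GPU value. A quick check of the substitutions:

```python
# Old global value -> new per-GPU value, assuming 8 data-parallel ranks.
n_gpus = 8
conversions = {
    "actor_rollout_ref.actor.ppo_micro_batch_size": (32, 4),
    "actor_rollout_ref.rollout.log_prob_micro_batch_size": (128, 16),
    "actor_rollout_ref.ref.log_prob_micro_batch_size": (128, 16),
    "critic.ppo_micro_batch_size": (32, 4),
}
for key, (old_global, new_per_gpu) in conversions.items():
    assert old_global // n_gpus == new_per_gpu, key
```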
...@@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo \ ...@@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \ actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=False \ actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \ actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \ critic.optim.lr=1e-5 \
critic.model.use_remove_padding=False \ critic.model.use_remove_padding=False \
critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \ critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \ critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \ critic.model.fsdp_config.optimizer_offload=False \
......
...@@ -15,22 +15,22 @@ python3 -m verl.trainer.main_ppo \ ...@@ -15,22 +15,22 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \ actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \ critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \ critic.model.use_remove_padding=True \
critic.optim.lr_warmup_steps_ratio=0.05 \ critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \ critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \ critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \ critic.model.fsdp_config.optimizer_offload=False \
...@@ -38,7 +38,7 @@ python3 -m verl.trainer.main_ppo \ ...@@ -38,7 +38,7 @@ python3 -m verl.trainer.main_ppo \
reward_model.model.path=Qwen/Qwen2.5-0.5B\ reward_model.model.path=Qwen/Qwen2.5-0.5B\
reward_model.model.use_remove_padding=True \ reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \ reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \ reward_model.micro_batch_size_per_gpu=16 \
algorithm.kl_ctrl.kl_coef=0.001 \ algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \ trainer.critic_warmup=0 \
trainer.logger=['console'] \ trainer.logger=['console'] \
......
...@@ -15,22 +15,22 @@ python3 -m verl.trainer.main_ppo \ ...@@ -15,22 +15,22 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.model.use_remove_padding=False \ actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \ actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \ critic.optim.lr=1e-5 \
critic.model.use_remove_padding=False \ critic.model.use_remove_padding=False \
critic.optim.lr_warmup_steps_ratio=0.05 \ critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \ critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \ critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \ critic.model.fsdp_config.optimizer_offload=False \
...@@ -38,7 +38,7 @@ python3 -m verl.trainer.main_ppo \ ...@@ -38,7 +38,7 @@ python3 -m verl.trainer.main_ppo \
reward_model.model.path=Qwen/Qwen2.5-0.5B\ reward_model.model.path=Qwen/Qwen2.5-0.5B\
reward_model.model.use_remove_padding=False \ reward_model.model.use_remove_padding=False \
reward_model.model.fsdp_config.param_offload=True \ reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \ reward_model.micro_batch_size_per_gpu=16 \
algorithm.kl_ctrl.kl_coef=0.001 \ algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \ trainer.critic_warmup=0 \
+trainer.val_before_train=False \ +trainer.val_before_train=False \
......
...@@ -15,13 +15,13 @@ python3 -m verl.trainer.main_ppo \ ...@@ -15,13 +15,13 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.use_dynamic_bsz=True \ actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=12000 \ actor_rollout_ref.actor.ppo_max_token_len_per_gpu=12000 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
...@@ -33,7 +33,7 @@ python3 -m verl.trainer.main_ppo \ ...@@ -33,7 +33,7 @@ python3 -m verl.trainer.main_ppo \
critic.optim.lr_warmup_steps_ratio=0.05 \ critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \ critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \ critic.ppo_micro_batch_size_per_gpu=4 \
critic.use_dynamic_bsz=True \ critic.use_dynamic_bsz=True \
critic.ppo_max_token_len_per_gpu=98304 \ critic.ppo_max_token_len_per_gpu=98304 \
critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.param_offload=False \
...@@ -43,7 +43,7 @@ python3 -m verl.trainer.main_ppo \ ...@@ -43,7 +43,7 @@ python3 -m verl.trainer.main_ppo \
reward_model.model.path=Qwen/Qwen2.5-0.5B\ reward_model.model.path=Qwen/Qwen2.5-0.5B\
reward_model.model.use_remove_padding=True \ reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \ reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \ reward_model.micro_batch_size_per_gpu=16 \
reward_model.use_dynamic_bsz=True \ reward_model.use_dynamic_bsz=True \
reward_model.forward_max_token_len_per_gpu=98304 \ reward_model.forward_max_token_len_per_gpu=98304 \
algorithm.kl_ctrl.kl_coef=0.001 \ algorithm.kl_ctrl.kl_coef=0.001 \
......
...@@ -15,17 +15,17 @@ python3 -m verl.trainer.main_ppo \ ...@@ -15,17 +15,17 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \ actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \ actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \ actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \ actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \ actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \ actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \ actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \ actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \ actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \ actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \ actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \ actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \ actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \ actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \ critic.optim.lr=1e-5 \
critic.ulysses_sequence_parallel_size=2 \ critic.ulysses_sequence_parallel_size=2 \
...@@ -33,7 +33,7 @@ python3 -m verl.trainer.main_ppo \ ...@@ -33,7 +33,7 @@ python3 -m verl.trainer.main_ppo \
critic.optim.lr_warmup_steps_ratio=0.05 \ critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path=Qwen/Qwen2.5-0.5B \ critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \ critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \ critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \ critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \ critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \ critic.model.fsdp_config.optimizer_offload=False \
...@@ -43,7 +43,7 @@ python3 -m verl.trainer.main_ppo \ ...@@ -43,7 +43,7 @@ python3 -m verl.trainer.main_ppo \
reward_model.model.path=Qwen/Qwen2.5-0.5B\ reward_model.model.path=Qwen/Qwen2.5-0.5B\
reward_model.model.use_remove_padding=True \ reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \ reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \ reward_model.micro_batch_size_per_gpu=16 \
algorithm.kl_ctrl.kl_coef=0.001 \ algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \ trainer.critic_warmup=0 \
+trainer.val_before_train=False \ +trainer.val_before_train=False \
......
...@@ -11,6 +11,10 @@ python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \ ...@@ -11,6 +11,10 @@ python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \ data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \ data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \ actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
critic.ppo_micro_batch_size_per_gpu=1 \
critic.model.path=tests/e2e/arithmetic_sequence/model | tee $OUTPUT_FILE; critic.model.path=tests/e2e/arithmetic_sequence/model | tee $OUTPUT_FILE;
python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE
......
...@@ -10,7 +10,7 @@ torchrun --standalone --nnodes=1 --nproc_per_node=8 \ ...@@ -10,7 +10,7 @@ torchrun --standalone --nnodes=1 --nproc_per_node=8 \
data.response_key=extra_info \ data.response_key=extra_info \
+data.prompt_dict_keys=['question'] \ +data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \ +data.response_dict_keys=['answer'] \
data.micro_batch_size=32 \ data.micro_batch_size_per_gpu=32 \
model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \ model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
trainer.default_local_dir=$HOME/ckpts/ \ trainer.default_local_dir=$HOME/ckpts/ \
trainer.project_name=qwen2.5-sft \ trainer.project_name=qwen2.5-sft \
......
# Tested with 2 & 4 GPUs
set -x
if [ "$#" -lt 2 ]; then
echo "Usage: run_qwen_05_peft.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi
nproc_per_node=$1
save_path=$2
# Shift the arguments so $@ refers to the rest
shift 2
torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=extra_info \
data.response_key=extra_info \
optim.lr=1e-4 \
+data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \
data.micro_batch_size_per_gpu=4 \
model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
trainer.default_local_dir=$save_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct \
trainer.logger=['console'] \
trainer.total_training_steps=1 \
trainer.default_hdfs_dir=null $@ \
model.lora_rank=32 \
model.lora_alpha=16 \
model.target_modules=all-linear
# Or you can do this:
# model.target_modules=[q_proj,v_proj] \
...@@ -23,13 +23,13 @@ rollout: ...@@ -23,13 +23,13 @@ rollout:
dtype: bfloat16 # should align with FSDP dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.5 gpu_memory_utilization: 0.5
ignore_eos: False ignore_eos: False
micro_batch_size: 256
enforce_eager: True enforce_eager: True
free_cache_engine: True free_cache_engine: True
load_format: dummy_dtensor load_format: dummy_dtensor
tensor_model_parallel_size: 1 tensor_model_parallel_size: 1
max_num_batched_tokens: 8192 max_num_batched_tokens: 8192
max_num_seqs: 1024 max_num_seqs: 1024
log_prob_micro_batch_size: 8 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 8
# for hf rollout # for hf rollout
do_sample: True do_sample: True
\ No newline at end of file
...@@ -20,7 +20,9 @@ actor_rollout_ref: ...@@ -20,7 +20,9 @@ actor_rollout_ref:
actor: actor:
strategy: megatron # This is for backward-compatibility strategy: megatron # This is for backward-compatibility
ppo_mini_batch_size: 256 ppo_mini_batch_size: 256
ppo_micro_batch_size: 64 ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
use_dynamic_bsz: False
clip_ratio: 0.2 clip_ratio: 0.2
entropy_coeff: 0.001 entropy_coeff: 0.001
ppo_epochs: 1 ppo_epochs: 1
...@@ -48,7 +50,8 @@ actor_rollout_ref: ...@@ -48,7 +50,8 @@ actor_rollout_ref:
seed: 1 seed: 1
load_weight: True load_weight: True
param_offload: False param_offload: False
log_prob_micro_batch_size: 32 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
rollout: rollout:
name: vllm name: vllm
temperature: 1.0 temperature: 1.0
...@@ -66,7 +69,8 @@ actor_rollout_ref: ...@@ -66,7 +69,8 @@ actor_rollout_ref:
tensor_model_parallel_size: 2 tensor_model_parallel_size: 2
max_num_batched_tokens: 8192 max_num_batched_tokens: 8192
max_num_seqs: 1024 max_num_seqs: 1024
log_prob_micro_batch_size: 2 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
# for hf rollout # for hf rollout
do_sample: True do_sample: True
layer_name_map: layer_name_map:
...@@ -98,7 +102,9 @@ critic: ...@@ -98,7 +102,9 @@ critic:
seed: 1 seed: 1
load_weight: True load_weight: True
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: 2 ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs} ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle} shuffle: ${actor_rollout_ref.actor.shuffle}
cliprange_value: 0.5 cliprange_value: 0.5
...@@ -121,7 +127,9 @@ reward_model: ...@@ -121,7 +127,9 @@ reward_model:
external_lib: ${actor_rollout_ref.model.external_lib} external_lib: ${actor_rollout_ref.model.external_lib}
load_weight: True load_weight: True
param_offload: False param_offload: False
micro_batch_size: 64 micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
micro_batch_size_per_gpu: null
use_dynamic_bsz: ${critic.use_dynamic_bsz}
max_length: null max_length: null
algorithm: algorithm:
......
...@@ -21,7 +21,8 @@ actor_rollout_ref: ...@@ -21,7 +21,8 @@ actor_rollout_ref:
actor: actor:
strategy: fsdp # This is for backward-compatibility strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 256 ppo_mini_batch_size: 256
ppo_micro_batch_size: 64 ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
use_dynamic_bsz: False use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length} ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0 grad_clip: 1.0
...@@ -53,7 +54,8 @@ actor_rollout_ref: ...@@ -53,7 +54,8 @@ actor_rollout_ref:
wrap_policy: wrap_policy:
# transformer_layer_cls_to_wrap: None # transformer_layer_cls_to_wrap: None
min_num_params: 0 min_num_params: 0
log_prob_micro_batch_size: 128 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
...@@ -74,7 +76,8 @@ actor_rollout_ref: ...@@ -74,7 +76,8 @@ actor_rollout_ref:
tensor_model_parallel_size: 2 tensor_model_parallel_size: 2
max_num_batched_tokens: 8192 max_num_batched_tokens: 8192
max_num_seqs: 1024 max_num_seqs: 1024
log_prob_micro_batch_size: 128 log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
# for hf rollout # for hf rollout
...@@ -106,8 +109,10 @@ critic: ...@@ -106,8 +109,10 @@ critic:
min_num_params: 0 min_num_params: 0
fsdp_size: -1 fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size} ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: 64 ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size} forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz} use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2 ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu} forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
...@@ -129,7 +134,8 @@ reward_model: ...@@ -129,7 +134,8 @@ reward_model:
min_num_params: 0 min_num_params: 0
param_offload: False param_offload: False
fsdp_size: -1 fsdp_size: -1
micro_batch_size: 64 micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
micro_batch_size_per_gpu: null # set a number
max_length: null max_length: null
ulysses_sequence_parallel_size: 1 # sp size ulysses_sequence_parallel_size: 1 # sp size
use_dynamic_bsz: ${critic.use_dynamic_bsz} use_dynamic_bsz: ${critic.use_dynamic_bsz}
......
data: data:
train_batch_size: 256 train_batch_size: 256
micro_batch_size: 16 # this is also val batch size micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
micro_batch_size_per_gpu: 4 # this is also val batch size
train_files: ~/data/gsm8k/train.parquet train_files: ~/data/gsm8k/train.parquet
val_files: ~/data/gsm8k/test.parquet val_files: ~/data/gsm8k/test.parquet
prompt_key: question prompt_key: question
......
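As the inline comment notes, `micro_batch_size_per_gpu` in the SFT config doubles as the per-rank validation batch size (the corresponding `DataLoader` change appears further down). A small sketch of what that means in practice, using a toy dataset (`torch` assumed):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset

micro_batch_size_per_gpu = 4                       # value from the config above
val_dataset = TensorDataset(torch.arange(32))      # toy stand-in for the parquet data
val_dataloader = DataLoader(val_dataset, batch_size=micro_batch_size_per_gpu)
assert len(val_dataloader) == 8                    # 32 samples / 4 per batch, per rank
```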
...@@ -97,10 +97,10 @@ class FSDPSFTTrainer(object): ...@@ -97,10 +97,10 @@ class FSDPSFTTrainer(object):
print(f'Normalize batch size by dp {dp_size}') print(f'Normalize batch size by dp {dp_size}')
assert self.config.data.train_batch_size % dp_size == 0 assert self.config.data.train_batch_size % dp_size == 0
assert self.config.data.micro_batch_size % dp_size == 0
self.config.data.train_batch_size //= dp_size self.config.data.train_batch_size //= dp_size
self.config.data.micro_batch_size //= dp_size
assert self.config.data.train_batch_size % self.config.data.micro_batch_size_per_gpu == 0
def _build_dataloader(self): def _build_dataloader(self):
config = self.config config = self.config
...@@ -143,7 +143,7 @@ class FSDPSFTTrainer(object): ...@@ -143,7 +143,7 @@ class FSDPSFTTrainer(object):
rank=rank, rank=rank,
drop_last=True) drop_last=True)
self.val_dataloader = DataLoader(dataset=self.val_dataset, self.val_dataloader = DataLoader(dataset=self.val_dataset,
batch_size=config.data.micro_batch_size, batch_size=config.data.micro_batch_size_per_gpu,
sampler=self.val_sampler, sampler=self.val_sampler,
num_workers=8, num_workers=8,
pin_memory=True, pin_memory=True,
...@@ -285,7 +285,7 @@ class FSDPSFTTrainer(object): ...@@ -285,7 +285,7 @@ class FSDPSFTTrainer(object):
log_gpu_memory_usage('After optimizer zero_grad', logger=logger) log_gpu_memory_usage('After optimizer zero_grad', logger=logger)
micro_batches = batch.split(self.config.data.micro_batch_size) micro_batches = batch.split(self.config.data.micro_batch_size_per_gpu)
n_micro_batches = len(micro_batches) n_micro_batches = len(micro_batches)
step_loss = 0 step_loss = 0
for micro_batch in micro_batches: for micro_batch in micro_batches:
...@@ -373,7 +373,7 @@ class FSDPSFTTrainer(object): ...@@ -373,7 +373,7 @@ class FSDPSFTTrainer(object):
# Perform final validation # Perform final validation
val_losses = [] val_losses = []
for val_data in self.val_dataloader: for val_data in self.val_dataloader:
val_data = TensorDict(val_data, batch_size=self.config.data.micro_batch_size).cuda() val_data = TensorDict(val_data, batch_size=self.config.data.micro_batch_size_per_gpu).cuda()
val_loss = self.validation_step(val_data) val_loss = self.validation_step(val_data)
val_losses.append(val_loss) val_losses.append(val_loss)
if rank == 0: if rank == 0:
...@@ -389,7 +389,7 @@ class FSDPSFTTrainer(object): ...@@ -389,7 +389,7 @@ class FSDPSFTTrainer(object):
# validation # validation
val_losses = [] val_losses = []
for data in self.val_dataloader: for data in self.val_dataloader:
data = TensorDict(data, batch_size=self.config.data.micro_batch_size).cuda() data = TensorDict(data, batch_size=self.config.data.micro_batch_size_per_gpu).cuda()
val_loss = self.validation_step(data) val_loss = self.validation_step(data)
val_losses.append(val_loss) val_losses.append(val_loss)
if rank == 0: if rank == 0:
......
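The trainer changes above split the two concerns cleanly: `train_batch_size` stays global and is still divided by the data-parallel size, while `micro_batch_size_per_gpu` is consumed as-is when splitting each rank's batch. A hedged walk-through of the resulting arithmetic, with assumed values that satisfy the trainer's asserts:

```python
# Assumed settings: global train_batch_size=256, 8 data-parallel ranks,
# micro_batch_size_per_gpu=4, matching the asserts in FSDPSFTTrainer above.
train_batch_size, dp_size, micro_batch_size_per_gpu = 256, 8, 4

assert train_batch_size % dp_size == 0
train_batch_size //= dp_size                       # 32 samples per rank per step
assert train_batch_size % micro_batch_size_per_gpu == 0
n_micro_batches = train_batch_size // micro_batch_size_per_gpu
assert n_micro_batches == 8                        # micro batches accumulated per optimizer step
```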
...@@ -337,8 +337,13 @@ class RayPPOTrainer(object): ...@@ -337,8 +337,13 @@ class RayPPOTrainer(object):
else: else:
self.kl_ctrl = core_algos.FixedKLController(kl_coef=0.) self.kl_ctrl = core_algos.FixedKLController(kl_coef=0.)
self._validate_config()
self._create_dataloader() self._create_dataloader()
def _validate_config(self):
from verl.utils.config import validate_config
validate_config(self.config)
def _create_dataloader(self): def _create_dataloader(self):
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
# TODO: we have to make sure the batch size is divisible by the dp size # TODO: we have to make sure the batch size is divisible by the dp size
......
...@@ -21,3 +21,69 @@ def update_dict_with_config(dictionary: Dict, config: DictConfig): ...@@ -21,3 +21,69 @@ def update_dict_with_config(dictionary: Dict, config: DictConfig):
for key in dictionary: for key in dictionary:
if hasattr(config, key): if hasattr(config, key):
dictionary[key] = getattr(config, key) dictionary[key] = getattr(config, key)
def validate_config(config):
# number of GPUs total
n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes
# 1. Check total batch size for data correctness
real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n
assert real_train_batch_size % n_gpus == 0, \
f"real_train_batch_size ({real_train_batch_size}) must be divisible by total n_gpus ({n_gpus})."
# A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu"
# We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu".
def check_mutually_exclusive(mbs, mbs_per_gpu, name: str):
if mbs is None and mbs_per_gpu is None:
raise ValueError(f"[{name}] Please set at least one of '{name}.micro_batch_size' or "
f"'{name}.micro_batch_size_per_gpu'.")
if mbs is not None and mbs_per_gpu is not None:
raise ValueError(f"[{name}] You have set both '{name}.micro_batch_size' AND "
f"'{name}.micro_batch_size_per_gpu'. Please remove '{name}.micro_batch_size' "
f"because only '*_micro_batch_size_per_gpu' is supported (the former is deprecated).")
if not config.actor_rollout_ref.actor.use_dynamic_bsz:
# actor: ppo_micro_batch_size vs. ppo_micro_batch_size_per_gpu
check_mutually_exclusive(config.actor_rollout_ref.actor.ppo_micro_batch_size,
config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu, "actor_rollout_ref.actor")
# reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
check_mutually_exclusive(config.actor_rollout_ref.ref.log_prob_micro_batch_size,
config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu,
"actor_rollout_ref.ref")
# The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
check_mutually_exclusive(config.actor_rollout_ref.rollout.log_prob_micro_batch_size,
config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu,
"actor_rollout_ref.rollout")
if not config.critic.use_dynamic_bsz:
# Check for critic micro-batch size conflicts
check_mutually_exclusive(config.critic.ppo_micro_batch_size, config.critic.ppo_micro_batch_size_per_gpu,
"critic")
# Check for reward model micro-batch size conflicts
if config.reward_model.enable and not config.reward_model.use_dynamic_bsz:
check_mutually_exclusive(config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu,
"reward_model")
# Actor
# if NOT dynamic_bsz, we must ensure:
# ppo_mini_batch_size is divisible by ppo_micro_batch_size
# ppo_micro_batch_size * sequence_parallel_size >= n_gpus
if not config.actor_rollout_ref.actor.use_dynamic_bsz:
sp_size = config.actor_rollout_ref.actor.ulysses_sequence_parallel_size
if config.actor_rollout_ref.actor.ppo_micro_batch_size is not None:
assert config.actor_rollout_ref.actor.ppo_mini_batch_size % config.actor_rollout_ref.actor.ppo_micro_batch_size == 0
assert config.actor_rollout_ref.actor.ppo_micro_batch_size * sp_size >= n_gpus
# critic
if not config.critic.use_dynamic_bsz:
sp_size = config.critic.ulysses_sequence_parallel_size
if config.critic.ppo_micro_batch_size is not None:
assert config.critic.ppo_mini_batch_size % config.critic.ppo_micro_batch_size == 0
assert config.critic.ppo_micro_batch_size * sp_size >= n_gpus
print("[validate_config] All configuration checks passed successfully!")
...@@ -204,8 +204,8 @@ class DataParallelPPOActor(BasePPOActor): ...@@ -204,8 +204,8 @@ class DataParallelPPOActor(BasePPOActor):
# make sure we are in training mode # make sure we are in training mode
self.actor_module.train() self.actor_module.train()
assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size == 0 assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size_per_gpu == 0
self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size_per_gpu
temperature = data.meta_info['temperature'] # temperature must be in the data.meta_info to avoid silent error temperature = data.meta_info['temperature'] # temperature must be in the data.meta_info to avoid silent error
select_keys = ['responses', 'input_ids', 'attention_mask', 'position_ids', 'old_log_probs', 'advantages'] select_keys = ['responses', 'input_ids', 'attention_mask', 'position_ids', 'old_log_probs', 'advantages']
...@@ -226,7 +226,7 @@ class DataParallelPPOActor(BasePPOActor): ...@@ -226,7 +226,7 @@ class DataParallelPPOActor(BasePPOActor):
micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len) micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len)
else: else:
# split batch into micro_batches # split batch into micro_batches
micro_batches = mini_batch.split(self.config.ppo_micro_batch_size) micro_batches = mini_batch.split(self.config.ppo_micro_batch_size_per_gpu)
self.actor_optimizer.zero_grad() self.actor_optimizer.zero_grad()
......
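After this change, gradient accumulation in the actor is a purely local quantity: `ppo_mini_batch_size` has already been normalized per data-parallel rank inside the worker, and the per-GPU micro batch size divides it. A hedged numeric example (assuming `rollout.n=1`):

```python
# Global ppo_mini_batch_size=256 on 8 data-parallel ranks -> 32 per rank.
ppo_mini_batch_size = 256 // 8
ppo_micro_batch_size_per_gpu = 4

assert ppo_mini_batch_size % ppo_micro_batch_size_per_gpu == 0
gradient_accumulation = ppo_mini_batch_size // ppo_micro_batch_size_per_gpu
assert gradient_accumulation == 8      # micro batches per PPO mini-batch update
```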
...@@ -54,7 +54,7 @@ class MegatronPPOActor(BasePPOActor): ...@@ -54,7 +54,7 @@ class MegatronPPOActor(BasePPOActor):
Args: Args:
config (OmegaConf): the basic config that contains the hyper-parameters of PPO Actor. It must contain config (OmegaConf): the basic config that contains the hyper-parameters of PPO Actor. It must contain
``ppo_micro_batch_size``: minibatch size when updating ppo. ``ppo_micro_batch_size_per_gpu``: micro batch size when updating ppo.
``ppo_mini_batch_size``: minibatch size when updating ppo using the batch data. ``ppo_mini_batch_size``: minibatch size when updating ppo using the batch data.
...@@ -232,7 +232,7 @@ class MegatronPPOActor(BasePPOActor): ...@@ -232,7 +232,7 @@ class MegatronPPOActor(BasePPOActor):
if data.meta_info.get('micro_batch_size', None) is not None: if data.meta_info.get('micro_batch_size', None) is not None:
batch_size = data.meta_info['micro_batch_size'] batch_size = data.meta_info['micro_batch_size']
else: else:
batch_size = self.config.ppo_micro_batch_size batch_size = self.config.ppo_micro_batch_size_per_gpu
batches = split_dict_tensor_into_batches(data.batch, batch_size=batch_size) batches = split_dict_tensor_into_batches(data.batch, batch_size=batch_size)
# compute input shapes for pp stages # compute input shapes for pp stages
input_shapes = compute_transformers_input_shapes( input_shapes = compute_transformers_input_shapes(
......
...@@ -45,8 +45,8 @@ class DataParallelPPOCritic(BasePPOCritic): ...@@ -45,8 +45,8 @@ class DataParallelPPOCritic(BasePPOCritic):
self.use_remove_padding = self.config.model.get('use_remove_padding', False) self.use_remove_padding = self.config.model.get('use_remove_padding', False)
print(f'Critic use_remove_padding={self.use_remove_padding}') print(f'Critic use_remove_padding={self.use_remove_padding}')
assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size == 0 assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size_per_gpu == 0
self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size_per_gpu
self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1) self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1)
...@@ -161,7 +161,7 @@ class DataParallelPPOCritic(BasePPOCritic): ...@@ -161,7 +161,7 @@ class DataParallelPPOCritic(BasePPOCritic):
max_token_len = self.config.ppo_max_token_len_per_gpu * self.ulysses_sequence_parallel_size max_token_len = self.config.ppo_max_token_len_per_gpu * self.ulysses_sequence_parallel_size
micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len) micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len)
else: else:
micro_batches = mini_batch.split(self.config.ppo_micro_batch_size) micro_batches = mini_batch.split(self.config.ppo_micro_batch_size_per_gpu)
self.critic_optimizer.zero_grad() self.critic_optimizer.zero_grad()
......
...@@ -118,7 +118,7 @@ class MegatronPPOCritic(BasePPOCritic): ...@@ -118,7 +118,7 @@ class MegatronPPOCritic(BasePPOCritic):
group=mpu.get_pipeline_model_parallel_group()) group=mpu.get_pipeline_model_parallel_group())
# split into micro-batches # split into micro-batches
data.batch['attention_mask'] = data.batch['attention_mask'].to(bool) data.batch['attention_mask'] = data.batch['attention_mask'].to(bool)
batches = split_dict_tensor_into_batches(data.batch, batch_size=self.config.ppo_micro_batch_size) batches = split_dict_tensor_into_batches(data.batch, batch_size=self.config.ppo_micro_batch_size_per_gpu)
n_micro_batch = len(batches) n_micro_batch = len(batches)
seq_len = batches[0]['input_ids'].shape[1] seq_len = batches[0]['input_ids'].shape[1]
...@@ -182,7 +182,7 @@ class MegatronPPOCritic(BasePPOCritic): ...@@ -182,7 +182,7 @@ class MegatronPPOCritic(BasePPOCritic):
model=self.critic_module, model=self.critic_module,
num_microbatches=n_micro_batch, num_microbatches=n_micro_batch,
input_shapes=input_shapes, # must set for flash-attn sequence packing input_shapes=input_shapes, # must set for flash-attn sequence packing
seq_length=self.config.ppo_micro_batch_size * seq_len, # no use when input_shapes was set seq_length=self.config.ppo_micro_batch_size_per_gpu * seq_len, # no use when input_shapes was set
hidden_size=self.model_config.hidden_size, # no use when input_shapes was set hidden_size=self.model_config.hidden_size, # no use when input_shapes was set
micro_batch_size=1, # no use when input_shapes was set micro_batch_size=1, # no use when input_shapes was set
forward_only=forward_only, forward_only=forward_only,
...@@ -193,7 +193,7 @@ class MegatronPPOCritic(BasePPOCritic): ...@@ -193,7 +193,7 @@ class MegatronPPOCritic(BasePPOCritic):
data_iterator=batch_generator, data_iterator=batch_generator,
model=self.critic_module, model=self.critic_module,
num_microbatches=n_micro_batch, num_microbatches=n_micro_batch,
seq_length=self.config.ppo_micro_batch_size * seq_len, # in use for pp = 1 seq_length=self.config.ppo_micro_batch_size_per_gpu * seq_len, # in use for pp = 1
hidden_size=self.model_config.hidden_size, # in use for pp = 1 hidden_size=self.model_config.hidden_size, # in use for pp = 1
micro_batch_size=1, # in use for pp = 1 micro_batch_size=1, # in use for pp = 1
forward_only=forward_only, forward_only=forward_only,
......
...@@ -119,18 +119,22 @@ class ActorRolloutRefWorker(Worker): ...@@ -119,18 +119,22 @@ class ActorRolloutRefWorker(Worker):
# normalize config # normalize config
if self._is_actor: if self._is_actor:
self.config.actor.ppo_mini_batch_size //= (self.device_mesh.shape[0] // self.ulysses_sequence_parallel_size) self.config.actor.ppo_mini_batch_size //= (self.device_mesh.shape[0] // self.ulysses_sequence_parallel_size)
self.config.actor.ppo_micro_batch_size //= (self.device_mesh.shape[0] //
self.ulysses_sequence_parallel_size)
self.config.actor.ppo_mini_batch_size *= self.config.rollout.n self.config.actor.ppo_mini_batch_size *= self.config.rollout.n
self.config.actor.ppo_micro_batch_size *= self.config.rollout.n # micro bsz
if self._is_rollout: if self.config.actor.ppo_micro_batch_size is not None:
self.config.actor.ppo_micro_batch_size //= (self.device_mesh.shape[0] //
self.ulysses_sequence_parallel_size)
self.config.actor.ppo_micro_batch_size_per_gpu = self.config.actor.ppo_micro_batch_size
# normalize rollout config
if self._is_rollout and self.config.rollout.log_prob_micro_batch_size is not None:
self.config.rollout.log_prob_micro_batch_size //= (self.device_mesh.shape[0] // self.config.rollout.log_prob_micro_batch_size //= (self.device_mesh.shape[0] //
self.ulysses_sequence_parallel_size) self.ulysses_sequence_parallel_size)
self.config.rollout.log_prob_micro_batch_size *= self.config.rollout.n self.config.rollout.log_prob_micro_batch_size_per_gpu = self.config.rollout.log_prob_micro_batch_size
if self._is_ref: # normalize ref config
if self._is_ref and self.config.ref.log_prob_micro_batch_size is not None:
self.config.ref.log_prob_micro_batch_size //= (self.device_mesh.shape[0] // self.config.ref.log_prob_micro_batch_size //= (self.device_mesh.shape[0] //
self.ulysses_sequence_parallel_size) self.ulysses_sequence_parallel_size)
self.config.ref.log_prob_micro_batch_size *= self.config.rollout.n self.config.ref.log_prob_micro_batch_size_per_gpu = self.config.ref.log_prob_micro_batch_size
def _build_model_optimizer(self, def _build_model_optimizer(self,
model_path, model_path,
...@@ -424,8 +428,6 @@ class ActorRolloutRefWorker(Worker): ...@@ -424,8 +428,6 @@ class ActorRolloutRefWorker(Worker):
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO) @register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def generate_sequences(self, prompts: DataProto): def generate_sequences(self, prompts: DataProto):
prompts = prompts.to('cuda') prompts = prompts.to('cuda')
# set to False if it is validation
recompute_log_prob = prompts.meta_info.get('recompute_log_prob', True)
assert self._is_rollout assert self._is_rollout
if self._is_offload_param: if self._is_offload_param:
...@@ -461,7 +463,7 @@ class ActorRolloutRefWorker(Worker): ...@@ -461,7 +463,7 @@ class ActorRolloutRefWorker(Worker):
assert self._is_actor assert self._is_actor
data = data.to('cuda') data = data.to('cuda')
# we should always recompute old_log_probs when it is HybridEngine # we should always recompute old_log_probs when it is HybridEngine
data.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size data.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size_per_gpu
data.meta_info['max_token_len'] = self.config.rollout.log_prob_max_token_len_per_gpu data.meta_info['max_token_len'] = self.config.rollout.log_prob_max_token_len_per_gpu
data.meta_info['use_dynamic_bsz'] = self.config.rollout.log_prob_use_dynamic_bsz data.meta_info['use_dynamic_bsz'] = self.config.rollout.log_prob_use_dynamic_bsz
data.meta_info['temperature'] = self.config.rollout.temperature data.meta_info['temperature'] = self.config.rollout.temperature
...@@ -489,7 +491,7 @@ class ActorRolloutRefWorker(Worker): ...@@ -489,7 +491,7 @@ class ActorRolloutRefWorker(Worker):
data = data.to('cuda') data = data.to('cuda')
micro_batch_size = self.config.ref.log_prob_micro_batch_size micro_batch_size = self.config.ref.log_prob_micro_batch_size_per_gpu
data.meta_info['micro_batch_size'] = micro_batch_size data.meta_info['micro_batch_size'] = micro_batch_size
data.meta_info['temperature'] = self.config.rollout.temperature data.meta_info['temperature'] = self.config.rollout.temperature
data.meta_info['max_token_len'] = self.config.ref.log_prob_max_token_len_per_gpu data.meta_info['max_token_len'] = self.config.ref.log_prob_max_token_len_per_gpu
...@@ -573,9 +575,13 @@ class CriticWorker(Worker): ...@@ -573,9 +575,13 @@ class CriticWorker(Worker):
# normalize config # normalize config
self.config.ppo_mini_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size) self.config.ppo_mini_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size)
self.config.ppo_micro_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size) if self.config.ppo_micro_batch_size is not None:
self.config.forward_micro_batch_size //= (torch.distributed.get_world_size() // self.config.ppo_micro_batch_size //= (torch.distributed.get_world_size() //
self.ulysses_sequence_parallel_size) self.ulysses_sequence_parallel_size)
self.config.forward_micro_batch_size //= (torch.distributed.get_world_size() //
self.ulysses_sequence_parallel_size)
self.config.ppo_micro_batch_size_per_gpu = self.config.ppo_micro_batch_size
self.config.forward_micro_batch_size_per_gpu = self.config.forward_micro_batch_size
def _build_critic_model_optimizer(self, config): def _build_critic_model_optimizer(self, config):
# the following line is necessary # the following line is necessary
...@@ -724,7 +730,7 @@ class CriticWorker(Worker): ...@@ -724,7 +730,7 @@ class CriticWorker(Worker):
load_fsdp_param_and_grad(module=self.critic_module, load_fsdp_param_and_grad(module=self.critic_module,
device_id=torch.cuda.current_device(), device_id=torch.cuda.current_device(),
load_grad=self._is_offload_grad) load_grad=self._is_offload_grad)
micro_batch_size = self.config.forward_micro_batch_size micro_batch_size = self.config.forward_micro_batch_size_per_gpu
data.meta_info['micro_batch_size'] = micro_batch_size data.meta_info['micro_batch_size'] = micro_batch_size
data.meta_info['max_token_len'] = self.config.forward_max_token_len_per_gpu data.meta_info['max_token_len'] = self.config.forward_max_token_len_per_gpu
data.meta_info['use_dynamic_bsz'] = self.config.use_dynamic_bsz data.meta_info['use_dynamic_bsz'] = self.config.use_dynamic_bsz
...@@ -838,7 +844,11 @@ class RewardModelWorker(Worker): ...@@ -838,7 +844,11 @@ class RewardModelWorker(Worker):
self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh) self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh)
self.use_remove_padding = self.config.model.get('use_remove_padding', False) self.use_remove_padding = self.config.model.get('use_remove_padding', False)
self.config.micro_batch_size //= torch.distributed.get_world_size()
# normalize config
if self.config.micro_batch_size is not None:
self.config.micro_batch_size //= torch.distributed.get_world_size()
self.config.micro_batch_size_per_gpu = self.config.micro_batch_size
def _build_model(self, config): def _build_model(self, config):
# the following line is necessary # the following line is necessary
...@@ -1054,7 +1064,7 @@ class RewardModelWorker(Worker): ...@@ -1054,7 +1064,7 @@ class RewardModelWorker(Worker):
max_token_len = self.config.forward_max_token_len_per_gpu * self.ulysses_sequence_parallel_size max_token_len = self.config.forward_max_token_len_per_gpu * self.ulysses_sequence_parallel_size
micro_batches, indices = rearrange_micro_batches(batch=rm_data.batch, max_token_len=max_token_len) micro_batches, indices = rearrange_micro_batches(batch=rm_data.batch, max_token_len=max_token_len)
else: else:
micro_batches = rm_data.batch.split(self.config.micro_batch_size) micro_batches = rm_data.batch.split(self.config.micro_batch_size_per_gpu)
output = [] output = []
for micro_batch in micro_batches: for micro_batch in micro_batches:
rm_score = self._forward_micro_batch(micro_batch) rm_score = self._forward_micro_batch(micro_batch)
......
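The worker-side backward-compatibility path above boils down to: if the deprecated global key is still set, divide it by the number of data-parallel ranks and copy the result into the new per-GPU field. A hedged sketch (assumed 8-rank world, no sequence parallelism):

```python
world_size, sp_size = 8, 1             # assumed values
ppo_micro_batch_size = 64              # user still set the deprecated global key

# Same normalization the FSDP workers perform before handing off to *_per_gpu.
if ppo_micro_batch_size is not None:
    ppo_micro_batch_size //= world_size // sp_size
    ppo_micro_batch_size_per_gpu = ppo_micro_batch_size
assert ppo_micro_batch_size_per_gpu == 8
```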
...@@ -112,13 +112,19 @@ class ActorRolloutRefWorker(MegatronWorker): ...@@ -112,13 +112,19 @@ class ActorRolloutRefWorker(MegatronWorker):
# normalize config # normalize config
if self._is_actor and self._is_rollout: if self._is_actor and self._is_rollout:
self.config.actor.ppo_mini_batch_size //= mpu.get_data_parallel_world_size() self.config.actor.ppo_mini_batch_size //= mpu.get_data_parallel_world_size()
self.config.actor.ppo_micro_batch_size //= mpu.get_data_parallel_world_size() if self.config.actor.ppo_micro_batch_size is not None:
self.config.rollout.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size() self.config.actor.ppo_micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.rollout.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.actor.ppo_micro_batch_size_per_gpu = self.config.actor.ppo_micro_batch_size
self.config.rollout.log_prob_micro_batch_size_per_gpu = self.config.rollout.log_prob_micro_batch_size
self._is_offload_param = self.config.actor.get('param_offload', False) self._is_offload_param = self.config.actor.get('param_offload', False)
self._is_offload_grad = self.config.actor.get('grad_offload', False) self._is_offload_grad = self.config.actor.get('grad_offload', False)
self._is_offload_optimizer = self.config.actor.get('optimizer_offload', False) self._is_offload_optimizer = self.config.actor.get('optimizer_offload', False)
elif self._is_ref: elif self._is_ref:
self.config.ref.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size() if self.config.ref.log_prob_micro_batch_size is not None:
self.config.ref.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.ref.log_prob_micro_batch_size_per_gpu = self.config.ref.log_prob_micro_batch_size
self._is_offload_param = self.config.ref.get('param_offload', False) self._is_offload_param = self.config.ref.get('param_offload', False)
def _build_model_optimizer(self, def _build_model_optimizer(self,
...@@ -361,7 +367,7 @@ class ActorRolloutRefWorker(MegatronWorker): ...@@ -361,7 +367,7 @@ class ActorRolloutRefWorker(MegatronWorker):
validate = prompts.meta_info.get('validate', False) validate = prompts.meta_info.get('validate', False)
if self._is_actor and not validate: if self._is_actor and not validate:
# we should always recompute old_log_probs when it is HybridEngine # we should always recompute old_log_probs when it is HybridEngine
output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size_per_gpu
output.meta_info['temperature'] = self.config.rollout.temperature output.meta_info['temperature'] = self.config.rollout.temperature
old_log_probs = self.actor.compute_log_prob(data=output) old_log_probs = self.actor.compute_log_prob(data=output)
output.batch['old_log_probs'] = old_log_probs output.batch['old_log_probs'] = old_log_probs
...@@ -380,7 +386,7 @@ class ActorRolloutRefWorker(MegatronWorker): ...@@ -380,7 +386,7 @@ class ActorRolloutRefWorker(MegatronWorker):
if self._is_offload_param: if self._is_offload_param:
load_megatron_param_and_grad(self.ref_module, torch.cuda.current_device(), self._is_offload_grad) load_megatron_param_and_grad(self.ref_module, torch.cuda.current_device(), self._is_offload_grad)
micro_batch_size = self.config.rollout.log_prob_micro_batch_size micro_batch_size = self.config.rollout.log_prob_micro_batch_size_per_gpu
data.meta_info['micro_batch_size'] = micro_batch_size data.meta_info['micro_batch_size'] = micro_batch_size
data.meta_info['temperature'] = self.config.rollout.temperature data.meta_info['temperature'] = self.config.rollout.temperature
output = self.ref_policy.compute_log_prob(data=data) output = self.ref_policy.compute_log_prob(data=data)
...@@ -439,7 +445,9 @@ class CriticWorker(MegatronWorker): ...@@ -439,7 +445,9 @@ class CriticWorker(MegatronWorker):
# normalize config # normalize config
self.config.ppo_mini_batch_size //= mpu.get_data_parallel_world_size() self.config.ppo_mini_batch_size //= mpu.get_data_parallel_world_size()
self.config.ppo_micro_batch_size //= mpu.get_data_parallel_world_size() if self.config.ppo_micro_batch_size is not None:
self.config.ppo_micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.ppo_micro_batch_size_per_gpu = self.config.ppo_micro_batch_size
# TODO(sgm): support critic model offload # TODO(sgm): support critic model offload
...@@ -609,7 +617,9 @@ class RewardModelWorker(MegatronWorker): ...@@ -609,7 +617,9 @@ class RewardModelWorker(MegatronWorker):
set_random_seed(seed=self.config.megatron.seed) set_random_seed(seed=self.config.megatron.seed)
# normalize config # normalize config
self.config.micro_batch_size //= mpu.get_data_parallel_world_size() if self.config.micro_batch_size is not None:
self.config.micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.micro_batch_size_per_gpu = self.config.micro_batch_size
def _build_rm_model(self, model_path, megatron_config: ModelParallelConfig, override_model_config): def _build_rm_model(self, model_path, megatron_config: ModelParallelConfig, override_model_config):
from megatron.core.models.gpt.gpt_model import ModelType from megatron.core.models.gpt.gpt_model import ModelType
......
...@@ -196,8 +196,8 @@ class MegatronRewardModel(BasePPORewardModel): ...@@ -196,8 +196,8 @@ class MegatronRewardModel(BasePPORewardModel):
group=mpu.get_pipeline_model_parallel_group()) group=mpu.get_pipeline_model_parallel_group())
# split into micro-batches # split into micro-batches
if self.config is not None and 'ppo_micro_batch_size' in self.config: if self.config is not None and 'ppo_micro_batch_size_per_gpu' in self.config:
infer_batch_size = self.config.ppo_micro_batch_size infer_batch_size = self.config.ppo_micro_batch_size_per_gpu
else: else:
infer_batch_size = data.batch.batch_size[0] infer_batch_size = data.batch.batch_size[0]
......