Unverified Commit f2a76acd by Guangming Sheng Committed by GitHub

[BREAKING][misc] feat: change micro_batch_size to micro_batch_size_per_gpu (#136)

## Summary

This PR changes all `micro_batch_size` settings to `micro_batch_size_per_gpu`.

**The core logic of setting batch sizes** (see the sketch after this list):

- **All algorithmic metrics** (train batch size, PPO mini batch size) are
global from the single-controller perspective and are normalized inside
each Worker.
- **All performance-related parameters** (micro batch size, max token
length under dynamic batch size) are local: they describe the data size
per GPU (i.e., per Worker).
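
To make this convention concrete, here is a minimal sketch of how a Worker
would treat the two kinds of values (the function and its arguments are
illustrative, not verl's actual implementation):

```python
def resolve_worker_batch_sizes(ppo_mini_batch_size: int,
                               ppo_micro_batch_size_per_gpu: int,
                               dp_world_size: int):
    # Algorithmic, global: split evenly across the data-parallel Workers.
    mini_bsz_per_worker = ppo_mini_batch_size // dp_world_size
    # Performance-related, local: already a per-GPU value, used as-is.
    return mini_bsz_per_worker, ppo_micro_batch_size_per_gpu


# Example: a global mini batch of 256 on 8 GPUs gives 32 samples per Worker,
# processed 8 sequences at a time on each GPU.
print(resolve_worker_batch_sizes(256, 8, dp_world_size=8))  # (32, 8)
```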

## Main Changes

1. Update the scripts and configs, and remove the normalization of the
micro batch sizes (a worked migration example follows this list)
2. Fix CI for SFT
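
When migrating existing launch scripts, the updated examples in this PR follow
a simple rule: the old global micro batch size divided by the total number of
GPUs gives the new per-GPU value. A small illustrative helper (not part of
verl) makes the arithmetic explicit:

```python
def to_per_gpu(global_micro_bsz: int, n_gpus_per_node: int, nnodes: int = 1) -> int:
    """Illustrative migration helper: old global micro batch size -> per-GPU value."""
    n_gpus = n_gpus_per_node * nnodes
    assert global_micro_bsz % n_gpus == 0, "the old global value should divide the GPU count evenly"
    return global_micro_bsz // n_gpus


# e.g. the old ppo_micro_batch_size=64 on a single 8-GPU node becomes
# ppo_micro_batch_size_per_gpu=8, matching the updated example scripts.
print(to_per_gpu(64, n_gpus_per_node=8))  # 8
```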
parent c17e6c62
......@@ -43,4 +43,4 @@ jobs:
- name: Running gsm8k e2e training tests with LoRA
run: |
ray stop --force
bash examples/sft/gsm8k/run_qwen_05_peft.sh 8 $HOME/ckpts/
\ No newline at end of file
bash tests/sft/run_sft_qwen05_peft.sh 8 $HOME/ckpts/
\ No newline at end of file
......@@ -59,60 +59,79 @@ Actor/Rollout/Reference Policy
.. code:: yaml
actor_rollout_ref:
hybrid_engine: True
model:
path: ~/models/deepseek-llm-7b-chat
external_lib: null
override_config: {}
enable_gradient_checkpointing: False
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: 64
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
ppo_epochs: 1
shuffle: True
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
grad_offload: False
optimizer_offload: False
ref:
fsdp_config:
param_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: 128
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.5
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor # or dummy_hf or dummy_megatron
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: 128
# for vllm and hf rollout
do_sample: True
hybrid_engine: True
model:
path: ~/models/deepseek-llm-7b-chat
external_lib: null
override_config: { }
enable_gradient_checkpointing: False
use_remove_padding: False
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 8
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
use_kl_loss: False # True for GRPO
kl_loss_coef: 0.001 # for grpo
kl_loss_type: low_var_kl # for grpo
ppo_epochs: 1
shuffle: False
ulysses_sequence_parallel_size: 1 # sp size
optim:
lr: 1e-6
lr_warmup_steps_ratio: 0. # the total steps will be injected during runtime
min_lr_ratio: null # only useful for warmup with cosine
warmup_style: constant # select from constant/cosine
total_training_steps: -1 # must be override by program
fsdp_config:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
param_offload: False
grad_offload: False
optimizer_offload: False
fsdp_size: -1
ref:
fsdp_config:
param_offload: False
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 16
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
rollout:
name: vllm
temperature: 1.0
top_k: -1 # 0 for hf rollout, -1 for vllm rollout
top_p: 1
prompt_length: ${data.max_prompt_length} # not use for opensource
response_length: ${data.max_response_length}
# for vllm rollout
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.5
ignore_eos: False
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 16
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
# for hf rollout
do_sample: True
# number of responses (i.e. num sample times)
n: 1 # > 1 for grpo
**Common config for actor, rollout and reference model**
......@@ -136,11 +155,15 @@ Actor/Rollout/Reference Policy
- ``actor_rollout_ref.actor.ppo_mini_batch_size``: One sample is split
into multiple sub-batches with batch_size=ppo_mini_batch_size for PPO
updates
updates. The ppo_mini_batch_size is a global size across all workers/GPUs.
- ``actor_rollout_ref.actor.ppo_micro_batch_size``: [Will be deprecated, use ppo_micro_batch_size_per_gpu]
Similar to gradient accumulation, this is the micro batch size for one forward pass,
trading speed for GPU memory. The value represents the global size across all GPUs.
- ``actor_rollout_ref.actor.ppo_micro_batch_size``: Similar to gradient
accumulation, the micro_batch_size for one forward pass, trading speed
for GPU memory
- ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu``: Similar to gradient
accumulation, this is the micro batch size for one forward pass, trading speed
for GPU memory. The value is the local size per GPU (see the example after this list).
- ``actor_rollout_ref.actor.grad_clip``: Gradient clipping for actor
updates
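
The relationship between the global mini batch size and the per-GPU micro batch
size can be illustrated with a small calculation (the numbers below are only an
example and assume a single node with 8 GPUs):

.. code:: python

   # Illustrative only. With a global ppo_mini_batch_size of 256 on 8 GPUs,
   # each GPU owns 256 / 8 = 32 samples per PPO update. A
   # ppo_micro_batch_size_per_gpu of 8 then implies 32 / 8 = 4 forward/backward
   # passes (gradient accumulation steps) per update on every GPU.
   n_gpus = 8
   ppo_mini_batch_size = 256            # global
   ppo_micro_batch_size_per_gpu = 8     # local, per GPU
   samples_per_gpu = ppo_mini_batch_size // n_gpus
   grad_accum_steps = samples_per_gpu // ppo_micro_batch_size_per_gpu
   print(samples_per_gpu, grad_accum_steps)  # 32 4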
......@@ -176,8 +199,12 @@ Actor/Rollout/Reference Policy
- ``actor_rollout_ref.ref``: FSDP config same as actor. **For models
larger than 7B, it's recommended to turn on offload for ref by
default**
- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: The batch size
for one forward pass in the computation of ``ref_log_prob``.
- ``actor_rollout_ref.ref.log_prob_micro_batch_size``: [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
The batch size for one forward pass in the computation of ``ref_log_prob``. The value represents the global size.
- ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``: The batch size
for one forward pass in the computation of ``ref_log_prob``. The value represents the local size per GPU.
**Rollout Model**
......@@ -201,8 +228,11 @@ Actor/Rollout/Reference Policy
- ``tensor_model_parallel_size``: TP size for rollout. Only effective
for vllm.
- ``log_prob_micro_batch_size``: Micro_batch_size (The batch size for
one forward pass) for recalculating log_prob.
- ``actor_rollout_ref.rollout.log_prob_micro_batch_size``: [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
The batch size for one forward pass in the computation of ``log_prob``. The value represents the global size.
- ``log_prob_micro_batch_size_per_gpu``: Micro batch size per GPU (the batch size for
one forward pass) for recalculating ``log_prob``. The value represents the local size per GPU.
- ``do_sample``: Whether to sample. If set to False, the rollout model
will perform greedy sampling. We disable ``do_sample`` during
......@@ -260,7 +290,7 @@ Reward Model
fsdp_config:
min_num_params: 0
param_offload: False
micro_batch_size: 64
micro_batch_size_per_gpu: 16
max_length: null
- ``reward_model.enable``: Whether to enable reward model. If False, we
......
......@@ -85,7 +85,7 @@ We also provide various training scripts for SFT on GSM8K dataset in `gsm8k sft
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=question \
data.response_key=answer \
data.micro_batch_size=8 \
data.micro_batch_size_per_gpu=8 \
model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \
trainer.default_hdfs_dir=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ \
trainer.project_name=gsm8k-sft \
......@@ -136,21 +136,20 @@ The script of run_deepseek7b_llm.sh
actor_rollout_ref.model.path=~/models/deepseek-llm-7b-chat \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.micro_batch_size=256 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.path=~/models/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=64 \
critic.ppo_micro_batch_size_per_gpu=16 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......
......@@ -92,14 +92,14 @@ Set the ``data.train_files`` ,\ ``data.val_files``, ``actor_rollout_ref.model.pa
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=8 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=4 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
critic.optim.lr=1e-5 \
critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
critic.ppo_micro_batch_size=4 \
critic.ppo_micro_batch_size_per_gpu=4 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.logger=['console'] \
+trainer.val_before_train=False \
......@@ -133,8 +133,8 @@ If you encounter out of memory issues with HBM less than 32GB, enable the follow
.. code-block:: bash
actor_rollout_ref.actor.ppo_micro_batch_size=1 \
critic.ppo_micro_batch_size=1 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
critic.ppo_micro_batch_size_per_gpu=1 \
For the full set of configs, please refer to :ref:`config-explain-page` for a detailed explanation and performance tuning.
......
......@@ -12,7 +12,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=128 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
......@@ -20,16 +20,16 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=256 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size=256 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
trainer.logger=['console'] \
trainer.project_name='verl_grpo_example_gsm8k' \
trainer.experiment_name='deepseek_llm_7b_function_rm' \
trainer.n_gpus_per_node=8 \
......
......@@ -14,7 +14,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=128 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
......@@ -22,12 +22,12 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=256 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size=256 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
......
......@@ -11,21 +11,22 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.model.enable_gradient_checkpointing=True \
critic.ppo_micro_batch_size_per_gpu=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......@@ -37,4 +38,5 @@ python3 -m verl.trainer.main_ppo \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=1 \
trainer.total_epochs=15 $@
......@@ -11,24 +11,24 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=128 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=256 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.ref.log_prob_micro_batch_size=256 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.ulysses_sequence_parallel_size=2 \
critic.model.use_remove_padding=True \
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=64 \
critic.model.enable_gradient_checkpointing=True \
critic.ppo_micro_batch_size_per_gpu=64 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......
......@@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
actor_rollout_ref.actor.ppo_micro_batch_size=16 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=16 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.ref.param_offload=False \
critic.optim.lr=1e-5 \
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=16 \
critic.ppo_micro_batch_size_per_gpu=4 \
reward_model.enable=True \
reward_model.megatron.tensor_model_parallel_size=4 \
reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \
reward_model.micro_batch_size=16 \
reward_model.micro_batch_size_per_gpu=4 \
reward_model.param_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
......
......@@ -18,16 +18,16 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat
actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
critic.optim.lr=1e-5 \
critic.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.ppo_micro_batch_size_per_gpu=4 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
......
......@@ -10,16 +10,16 @@ python3 -m verl.trainer.main_ppo --config-path=./config --config-name='ppo_megat
actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
actor_rollout_ref.actor.optim.lr=2e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=64 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
critic.optim.lr=2e-5 \
critic.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=64 \
critic.ppo_micro_batch_size_per_gpu=8 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
......
......@@ -11,21 +11,21 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
actor_rollout_ref.actor.ppo_micro_batch_size=4 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=4 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.model.path=google/gemma-2-2b-it \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=4 \
critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......
......@@ -19,21 +19,22 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=16 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=16 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.model.path=Qwen/Qwen2-7B-Instruct \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=16 \
critic.model.enable_gradient_checkpointing=True \
critic.ppo_micro_batch_size_per_gpu=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......
......@@ -23,22 +23,22 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=16 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=16 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.ref.log_prob_micro_batch_size=16 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path=Qwen/Qwen2-7B-Instruct \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=16 \
critic.ppo_micro_batch_size_per_gpu=16 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......@@ -46,7 +46,7 @@ python3 -m verl.trainer.main_ppo \
reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \
reward_model.micro_batch_size_per_gpu=32 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
......
......@@ -45,7 +45,7 @@ python3 -m verl.trainer.main_ppo \
reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1\
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \
reward_model.micro_batch_size_per_gpu=32 \
reward_model.use_dynamic_bsz=True \
reward_model.forward_max_token_len_per_gpu=98304 \
algorithm.kl_ctrl.kl_coef=0.001 \
......
......@@ -20,21 +20,22 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=16 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.model.path=Qwen/Qwen2.5-32B-Instruct \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.ppo_micro_batch_size_per_gpu=8 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......
......@@ -646,7 +646,7 @@
"\u001b[36m(main_task pid=28294)\u001b[0m 'path': '/teamspace/studios/this_studio/models/Qwen2.5-0.5B-Instruct'},\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'ref': {'fsdp_config': {'param_offload': False,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'wrap_policy': {'min_num_params': 0}},\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size': 4},\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size_per_gpu': 4},\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'rollout': {'do_sample': True,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'dtype': 'bfloat16',\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'enforce_eager': True,\n",
......@@ -654,7 +654,7 @@
"\u001b[36m(main_task pid=28294)\u001b[0m 'gpu_memory_utilization': 0.4,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'ignore_eos': False,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'load_format': 'dummy_dtensor',\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size': 1,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'log_prob_micro_batch_size_per_gpu': 1,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'max_num_batched_tokens': 8192,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'max_num_seqs': 1024,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'n': 1,\n",
......@@ -671,7 +671,7 @@
"\u001b[36m(main_task pid=28294)\u001b[0m 'kl_penalty': 'kl',\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'lam': 1.0},\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'critic': {'cliprange_value': 0.5,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'forward_micro_batch_size': 4,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'forward_micro_batch_size_per_gpu': 4,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'grad_clip': 1.0,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'model': {'enable_gradient_checkpointing': False,\n",
"\u001b[36m(main_task pid=28294)\u001b[0m 'external_lib': None,\n",
......@@ -1110,10 +1110,10 @@
" actor_rollout_ref.actor.optim.lr=1e-6 \\\n",
" actor_rollout_ref.actor.ppo_mini_batch_size=64 \\\n",
" actor_rollout_ref.actor.ppo_micro_batch_size=1 \\\n",
" actor_rollout_ref.rollout.log_prob_micro_batch_size=1 \\\n",
" actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \\\n",
" actor_rollout_ref.rollout.tensor_model_parallel_size=1 \\\n",
" actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \\\n",
" actor_rollout_ref.ref.log_prob_micro_batch_size=4 \\\n",
" actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \\\n",
" critic.optim.lr=1e-5 \\\n",
" critic.model.path=$HOME/models/Qwen2.5-0.5B-Instruct \\\n",
" critic.ppo_micro_batch_size=1 \\\n",
......
set -x
hdfs_path=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ # replace to your own hdfs/local path
if [ "$#" -lt 2 ]; then
echo "Usage: run_deepseek_6b7.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi
nproc_per_node=$1
save_path=$2
# Shift the arguments so $@ refers to the rest
shift 2
torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=prompt \
data.response_key=answer \
data.micro_batch_size=8 \
data.prompt_key=extra_info \
data.response_key=extra_info \
+data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \
data.micro_batch_size_per_gpu=4 \
model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \
trainer.default_hdfs_dir=$hdfs_path \
trainer.default_local_dir=$save_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \
trainer.total_epochs=4 \
trainer.logger=['console','wandb']
\ No newline at end of file
trainer.logger=['console','wandb'] \
trainer.default_hdfs_dir=null $@
\ No newline at end of file
......@@ -21,7 +21,7 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
data.response_key=extra_info \
+data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \
data.micro_batch_size=8 \
data.micro_batch_size_per_gpu=4 \
model.partial_pretrain=google/gemma-2b-it \
trainer.default_local_dir=$save_path \
trainer.project_name=gsm8k-sft \
......
set -x
hdfs_path=hdfs://user/verl/experiments/gsm8k/gemma-1.1-7b-it/ # replace to your own hdfs/local path
if [ "$#" -lt 2 ]; then
echo "Usage: run_gemma_7b.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi
nproc_per_node=$1
save_path=$2
# Shift the arguments so $@ refers to the rest
shift 2
torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
......@@ -10,10 +17,11 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=prompt \
data.response_key=answer \
data.micro_batch_size=8 \
data.micro_batch_size_per_gpu=4 \
model.partial_pretrain=google/gemma-1.1-7b-it \
trainer.default_hdfs_dir=$hdfs_path \
trainer.default_local_dir=$save_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-gemma-1.1-7b-it \
trainer.total_epochs=4 \
trainer.logger=['console','wandb']
\ No newline at end of file
trainer.logger=['console','wandb'] \
trainer.default_hdfs_dir=null $@
\ No newline at end of file
......@@ -22,13 +22,13 @@ torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
optim.lr=1e-4 \
+data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \
data.micro_batch_size=32 \
data.micro_batch_size_per_gpu=4 \
model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
trainer.default_local_dir=$save_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct \
trainer.logger=['console'] \
trainer.total_training_steps=1 \
trainer.total_epochs=1 \
trainer.default_hdfs_dir=null $@ \
model.lora_rank=32\
model.lora_alpha=16 \
......
......@@ -20,7 +20,8 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: 64
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 64
grad_clip: 1.0
clip_ratio: 0.2
entropy_coeff: 0.001
......@@ -45,7 +46,8 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: 128
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 128
rollout:
name: vllm
temperature: 1.0
......@@ -63,7 +65,8 @@ actor_rollout_ref:
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: 128
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 128
# for hf rollout
do_sample: True
# number of responses (i.e. num sample times)
......@@ -91,7 +94,8 @@ critic:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: 64
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: 64
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
grad_clip: 1.0
......@@ -107,7 +111,8 @@ reward_model:
fsdp_config:
min_num_params: 0
param_offload: False
micro_batch_size: 64
micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
micro_batch_size_per_gpu: 64
max_length: null
algorithm:
......
......@@ -10,20 +10,20 @@ python3 main_ppo_split.py \
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=16 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=32 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=32 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=16 \
critic.ppo_micro_batch_size_per_gpu=8 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......
......@@ -22,7 +22,8 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 200
ppo_micro_batch_size: 200
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
......@@ -50,7 +51,8 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
micro_batch_size: 200
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
......@@ -65,14 +67,15 @@ actor_rollout_ref:
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.1
ignore_eos: False
micro_batch_size: 200
micro_batch_size_per_gpu: 200
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 1
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: 200
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
# for hf rollout
......@@ -80,6 +83,7 @@ actor_rollout_ref:
# number of responses (i.e. num sample times)
n: 1 # > 1 for grpo
critic:
strategy: fsdp
optim:
......@@ -100,8 +104,10 @@ critic:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: 200
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
......@@ -128,7 +134,8 @@ reward_model:
fsdp_config:
min_num_params: 0
fsdp_size: -1
micro_batch_size: 8
micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
micro_batch_size_per_gpu: null # set a number
max_length: null
ulysses_sequence_parallel_size: 1 # sp size
......
......@@ -105,14 +105,6 @@ def main(config):
from omegaconf import OmegaConf
pprint(OmegaConf.to_container(config, resolve=True)) # resolve=True will eval symbol values
dp_size = config.trainer.n_gpus_per_node * config.trainer.nnodes
# normalize batch_size
# TODO: move this inside each role
config.actor_rollout_ref.actor.ppo_mini_batch_size //= dp_size
config.actor_rollout_ref.actor.ppo_micro_batch_size //= dp_size
config.critic.ppo_micro_batch_size //= dp_size
config.actor_rollout_ref.rollout.micro_batch_size //= dp_size
# print the config
# print initial config
print('Config after normalizing batch_size')
......
......@@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......
......@@ -13,21 +13,21 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=False \
critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......
......@@ -15,22 +15,22 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......@@ -38,7 +38,7 @@ python3 -m verl.trainer.main_ppo \
reward_model.model.path=Qwen/Qwen2.5-0.5B\
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \
reward_model.micro_batch_size_per_gpu=16 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
......
......@@ -15,22 +15,22 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=False \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......@@ -38,7 +38,7 @@ python3 -m verl.trainer.main_ppo \
reward_model.model.path=Qwen/Qwen2.5-0.5B\
reward_model.model.use_remove_padding=False \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \
reward_model.micro_batch_size_per_gpu=16 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
+trainer.val_before_train=False \
......
......@@ -15,13 +15,13 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=12000 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
......@@ -33,7 +33,7 @@ python3 -m verl.trainer.main_ppo \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.ppo_micro_batch_size_per_gpu=4 \
critic.use_dynamic_bsz=True \
critic.ppo_max_token_len_per_gpu=98304 \
critic.model.fsdp_config.param_offload=False \
......@@ -43,7 +43,7 @@ python3 -m verl.trainer.main_ppo \
reward_model.model.path=Qwen/Qwen2.5-0.5B\
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \
reward_model.micro_batch_size_per_gpu=16 \
reward_model.use_dynamic_bsz=True \
reward_model.forward_max_token_len_per_gpu=98304 \
algorithm.kl_ctrl.kl_coef=0.001 \
......
......@@ -15,17 +15,17 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size=32 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.grad_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.actor.fsdp_config.fsdp_size=-1 \
actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.ulysses_sequence_parallel_size=2 \
......@@ -33,7 +33,7 @@ python3 -m verl.trainer.main_ppo \
critic.optim.lr_warmup_steps_ratio=0.05 \
critic.model.path=Qwen/Qwen2.5-0.5B \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size=32 \
critic.ppo_micro_batch_size_per_gpu=4 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.grad_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
......@@ -43,7 +43,7 @@ python3 -m verl.trainer.main_ppo \
reward_model.model.path=Qwen/Qwen2.5-0.5B\
reward_model.model.use_remove_padding=True \
reward_model.model.fsdp_config.param_offload=True \
reward_model.micro_batch_size=16 \
reward_model.micro_batch_size_per_gpu=16 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
+trainer.val_before_train=False \
......
......@@ -11,6 +11,10 @@ python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
critic.ppo_micro_batch_size_per_gpu=1 \
critic.model.path=tests/e2e/arithmetic_sequence/model | tee $OUTPUT_FILE;
python3 tests/e2e/check_results.py --output_file=$OUTPUT_FILE
......
......@@ -10,7 +10,7 @@ torchrun --standalone --nnodes=1 --nproc_per_node=8 \
data.response_key=extra_info \
+data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \
data.micro_batch_size=32 \
data.micro_batch_size_per_gpu=32 \
model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
trainer.default_local_dir=$HOME/ckpts/ \
trainer.project_name=qwen2.5-sft \
......
# Tested with 2 & 4 GPUs
set -x
if [ "$#" -lt 2 ]; then
echo "Usage: run_qwen_05_peft.sh <nproc_per_node> <save_path> [other_configs...]"
exit 1
fi
nproc_per_node=$1
save_path=$2
# Shift the arguments so $@ refers to the rest
shift 2
torchrun --standalone --nnodes=1 --nproc_per_node=$nproc_per_node \
-m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=extra_info \
data.response_key=extra_info \
optim.lr=1e-4 \
+data.prompt_dict_keys=['question'] \
+data.response_dict_keys=['answer'] \
data.micro_batch_size_per_gpu=4 \
model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
trainer.default_local_dir=$save_path \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct \
trainer.logger=['console'] \
trainer.total_training_steps=1 \
trainer.default_hdfs_dir=null $@ \
model.lora_rank=32\
model.lora_alpha=16 \
model.target_modules=all-linear
# Or you can do this:
# model.target_modules=[q_proj,v_proj] \
......@@ -23,13 +23,13 @@ rollout:
dtype: bfloat16 # should align with FSDP
gpu_memory_utilization: 0.5
ignore_eos: False
micro_batch_size: 256
enforce_eager: True
free_cache_engine: True
load_format: dummy_dtensor
tensor_model_parallel_size: 1
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: 8
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: 8
# for hf rollout
do_sample: True
\ No newline at end of file
......@@ -20,7 +20,9 @@ actor_rollout_ref:
actor:
strategy: megatron # This is for backward-compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: 64
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
use_dynamic_bsz: False
clip_ratio: 0.2
entropy_coeff: 0.001
ppo_epochs: 1
......@@ -48,7 +50,8 @@ actor_rollout_ref:
seed: 1
load_weight: True
param_offload: False
log_prob_micro_batch_size: 32
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
rollout:
name: vllm
temperature: 1.0
......@@ -66,7 +69,8 @@ actor_rollout_ref:
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: 2
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
# for hf rollout
do_sample: True
layer_name_map:
......@@ -98,7 +102,9 @@ critic:
seed: 1
load_weight: True
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: 2
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
shuffle: ${actor_rollout_ref.actor.shuffle}
cliprange_value: 0.5
......@@ -121,7 +127,9 @@ reward_model:
external_lib: ${actor_rollout_ref.model.external_lib}
load_weight: True
param_offload: False
micro_batch_size: 64
micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
micro_batch_size_per_gpu: null
use_dynamic_bsz: ${critic.use_dynamic_bsz}
max_length: null
algorithm:
......
......@@ -21,7 +21,8 @@ actor_rollout_ref:
actor:
strategy: fsdp # This is for backward-compatibility
ppo_mini_batch_size: 256
ppo_micro_batch_size: 64
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
use_dynamic_bsz: False
ppo_max_token_len_per_gpu: 16384 # n * ${data.max_prompt_length} + ${data.max_response_length}
grad_clip: 1.0
......@@ -53,7 +54,8 @@ actor_rollout_ref:
wrap_policy:
# transformer_layer_cls_to_wrap: None
min_num_params: 0
log_prob_micro_batch_size: 128
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size} # sp size
......@@ -74,7 +76,8 @@ actor_rollout_ref:
tensor_model_parallel_size: 2
max_num_batched_tokens: 8192
max_num_seqs: 1024
log_prob_micro_batch_size: 128
log_prob_micro_batch_size: null # will be deprecated, use log_prob_micro_batch_size_per_gpu
log_prob_micro_batch_size_per_gpu: null
log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
# for hf rollout
......@@ -106,8 +109,10 @@ critic:
min_num_params: 0
fsdp_size: -1
ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
ppo_micro_batch_size: 64
ppo_micro_batch_size: null # will be deprecated, use ppo_micro_batch_size_per_gpu
ppo_micro_batch_size_per_gpu: null
forward_micro_batch_size: ${critic.ppo_micro_batch_size}
forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}
use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}
ppo_max_token_len_per_gpu: 32768 # (${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}) * 2
forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}
......@@ -129,7 +134,8 @@ reward_model:
min_num_params: 0
param_offload: False
fsdp_size: -1
micro_batch_size: 64
micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
micro_batch_size_per_gpu: null # set a number
max_length: null
ulysses_sequence_parallel_size: 1 # sp size
use_dynamic_bsz: ${critic.use_dynamic_bsz}
......
data:
train_batch_size: 256
micro_batch_size: 16 # this is also val batch size
micro_batch_size: null # will be deprecated, use micro_batch_size_per_gpu
micro_batch_size_per_gpu: 4 # this is also val batch size
train_files: ~/data/gsm8k/train.parquet
val_files: ~/data/gsm8k/test.parquet
prompt_key: question
......
......@@ -97,10 +97,10 @@ class FSDPSFTTrainer(object):
print(f'Normalize batch size by dp {dp_size}')
assert self.config.data.train_batch_size % dp_size == 0
assert self.config.data.micro_batch_size % dp_size == 0
self.config.data.train_batch_size //= dp_size
self.config.data.micro_batch_size //= dp_size
assert self.config.data.train_batch_size % self.config.data.micro_batch_size_per_gpu == 0
def _build_dataloader(self):
config = self.config
......@@ -143,7 +143,7 @@ class FSDPSFTTrainer(object):
rank=rank,
drop_last=True)
self.val_dataloader = DataLoader(dataset=self.val_dataset,
batch_size=config.data.micro_batch_size,
batch_size=config.data.micro_batch_size_per_gpu,
sampler=self.val_sampler,
num_workers=8,
pin_memory=True,
......@@ -285,7 +285,7 @@ class FSDPSFTTrainer(object):
log_gpu_memory_usage('After optimizer zero_grad', logger=logger)
micro_batches = batch.split(self.config.data.micro_batch_size)
micro_batches = batch.split(self.config.data.micro_batch_size_per_gpu)
n_micro_batches = len(micro_batches)
step_loss = 0
for micro_batch in micro_batches:
......@@ -373,7 +373,7 @@ class FSDPSFTTrainer(object):
# Perform final validation
val_losses = []
for val_data in self.val_dataloader:
val_data = TensorDict(val_data, batch_size=self.config.data.micro_batch_size).cuda()
val_data = TensorDict(val_data, batch_size=self.config.data.micro_batch_size_per_gpu).cuda()
val_loss = self.validation_step(val_data)
val_losses.append(val_loss)
if rank == 0:
......@@ -389,7 +389,7 @@ class FSDPSFTTrainer(object):
# validation
val_losses = []
for data in self.val_dataloader:
data = TensorDict(data, batch_size=self.config.data.micro_batch_size).cuda()
data = TensorDict(data, batch_size=self.config.data.micro_batch_size_per_gpu).cuda()
val_loss = self.validation_step(data)
val_losses.append(val_loss)
if rank == 0:
......
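To make the revised SFT normalization concrete, here is a worked example under assumed values (8 data-parallel ranks and the defaults above): only the global train batch size is divided by the DP size, while micro_batch_size_per_gpu is already a per-rank value and directly determines the number of micro-batches per step.

# Worked example of the SFT normalization above (hypothetical values).
dp_size = 8                    # data-parallel world size
train_batch_size = 256         # global, from data.train_batch_size
micro_batch_size_per_gpu = 4   # local, from data.micro_batch_size_per_gpu

assert train_batch_size % dp_size == 0
per_rank_batch_size = train_batch_size // dp_size                    # 32 samples per rank per step
assert per_rank_batch_size % micro_batch_size_per_gpu == 0
grad_accum_steps = per_rank_batch_size // micro_batch_size_per_gpu   # 8 micro-batches per rank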
......@@ -337,8 +337,13 @@ class RayPPOTrainer(object):
else:
self.kl_ctrl = core_algos.FixedKLController(kl_coef=0.)
self._validate_config()
self._create_dataloader()
def _validate_config(self):
from verl.utils.config import validate_config
validate_config(self.config)
def _create_dataloader(self):
from torch.utils.data import DataLoader
# TODO: we have to make sure the batch size is divisible by the dp size
......
......@@ -21,3 +21,69 @@ def update_dict_with_config(dictionary: Dict, config: DictConfig):
for key in dictionary:
if hasattr(config, key):
dictionary[key] = getattr(config, key)
def validate_config(config):
# number of GPUs total
n_gpus = config.trainer.n_gpus_per_node * config.trainer.nnodes
# 1. Check total batch size for data correctness
real_train_batch_size = config.data.train_batch_size * config.actor_rollout_ref.rollout.n
assert real_train_batch_size % n_gpus == 0, \
f"real_train_batch_size ({real_train_batch_size}) must be divisible by total n_gpus ({n_gpus})."
# A helper function to check "micro_batch_size" vs "micro_batch_size_per_gpu"
# We throw an error if the user sets both. The new convention is "..._micro_batch_size_per_gpu".
def check_mutually_exclusive(mbs, mbs_per_gpu, name: str):
if mbs is None and mbs_per_gpu is None:
raise ValueError(f"[{name}] Please set at least one of '{name}.micro_batch_size' or "
f"'{name}.micro_batch_size_per_gpu'.")
if mbs is not None and mbs_per_gpu is not None:
raise ValueError(f"[{name}] You have set both '{name}.micro_batch_size' AND "
f"'{name}.micro_batch_size_per_gpu'. Please remove '{name}.micro_batch_size' "
f"because only '*_micro_batch_size_per_gpu' is supported (the former is deprecated).")
if not config.actor_rollout_ref.actor.use_dynamic_bsz:
# actor: ppo_micro_batch_size vs. ppo_micro_batch_size_per_gpu
check_mutually_exclusive(config.actor_rollout_ref.actor.ppo_micro_batch_size,
config.actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu, "actor_rollout_ref.actor")
# reference: log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
check_mutually_exclusive(config.actor_rollout_ref.ref.log_prob_micro_batch_size,
config.actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu,
"actor_rollout_ref.ref")
# The rollout section also has log_prob_micro_batch_size vs. log_prob_micro_batch_size_per_gpu
check_mutually_exclusive(config.actor_rollout_ref.rollout.log_prob_micro_batch_size,
config.actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu,
"actor_rollout_ref.rollout")
if not config.critic.use_dynamic_bsz:
# Check for critic micro-batch size conflicts
check_mutually_exclusive(config.critic.ppo_micro_batch_size, config.critic.ppo_micro_batch_size_per_gpu,
"critic")
# Check for reward model micro-batch size conflicts
if config.reward_model.enable and not config.reward_model.use_dynamic_bsz:
check_mutually_exclusive(config.reward_model.micro_batch_size, config.reward_model.micro_batch_size_per_gpu,
"reward_model")
# Actor
# if NOT dynamic_bsz, we must ensure:
# ppo_mini_batch_size is divisible by ppo_micro_batch_size
# ppo_micro_batch_size * sequence_parallel_size >= n_gpus
if not config.actor_rollout_ref.actor.use_dynamic_bsz:
sp_size = config.actor_rollout_ref.actor.ulysses_sequence_parallel_size
if config.actor_rollout_ref.actor.ppo_micro_batch_size is not None:
assert config.actor_rollout_ref.actor.ppo_mini_batch_size % config.actor_rollout_ref.actor.ppo_micro_batch_size == 0
assert config.actor_rollout_ref.actor.ppo_micro_batch_size * sp_size >= n_gpus
# critic
if not config.critic.use_dynamic_bsz:
sp_size = config.critic.ulysses_sequence_parallel_size
if config.critic.ppo_micro_batch_size is not None:
assert config.critic.ppo_mini_batch_size % config.critic.ppo_micro_batch_size == 0
assert config.critic.ppo_micro_batch_size * sp_size >= n_gpus
print("[validate_config] All configuration checks passed successfully!")
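For illustration only, a standalone sketch of the mutual-exclusion rule that the nested helper above enforces (function name and error messages abbreviated here):

# Standalone sketch of the mutual-exclusion rule (abbreviated messages).
def _check_mutually_exclusive(mbs, mbs_per_gpu, name):
    if mbs is None and mbs_per_gpu is None:
        raise ValueError(f"[{name}] set '{name}.micro_batch_size_per_gpu'")
    if mbs is not None and mbs_per_gpu is not None:
        raise ValueError(f"[{name}] remove the deprecated '{name}.micro_batch_size'")

_check_mutually_exclusive(None, 8, "critic")    # ok: only the new per-GPU key is set
try:
    _check_mutually_exclusive(64, 8, "critic")  # both keys set -> ValueError
except ValueError as err:
    print(err)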
......@@ -204,8 +204,8 @@ class DataParallelPPOActor(BasePPOActor):
# make sure we are in training mode
self.actor_module.train()
assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size == 0
self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size
assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size_per_gpu == 0
self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size_per_gpu
temperature = data.meta_info['temperature'] # temperature must be in data.meta_info to avoid a silent error
select_keys = ['responses', 'input_ids', 'attention_mask', 'position_ids', 'old_log_probs', 'advantages']
......@@ -226,7 +226,7 @@ class DataParallelPPOActor(BasePPOActor):
micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len)
else:
# split batch into micro_batches
micro_batches = mini_batch.split(self.config.ppo_micro_batch_size)
micro_batches = mini_batch.split(self.config.ppo_micro_batch_size_per_gpu)
self.actor_optimizer.zero_grad()
......
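A quick worked example of the gradient-accumulation bookkeeping above, assuming hypothetical values (the mini batch size here is already the per-rank value after normalization in the worker):

# Worked example of the actor's gradient-accumulation bookkeeping (hypothetical values).
ppo_mini_batch_size = 32            # per data-parallel rank, after normalization
ppo_micro_batch_size_per_gpu = 8    # per-GPU micro batch from the config
assert ppo_mini_batch_size % ppo_micro_batch_size_per_gpu == 0
gradient_accumulation = ppo_mini_batch_size // ppo_micro_batch_size_per_gpu   # 4 micro-batches per update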
......@@ -54,7 +54,7 @@ class MegatronPPOActor(BasePPOActor):
Args:
config (OmegaConf): the basic config that contains the hyper-parameters of PPO Actor. It must contain
``ppo_micro_batch_size``: minibatch size when updating ppo.
``ppo_micro_batch_size_per_gpu``: micro batch size per GPU when updating ppo.
``ppo_mini_batch_size``: minibatch size when updating ppo using the batch data.
......@@ -232,7 +232,7 @@ class MegatronPPOActor(BasePPOActor):
if data.meta_info.get('micro_batch_size', None) is not None:
batch_size = data.meta_info['micro_batch_size']
else:
batch_size = self.config.ppo_micro_batch_size
batch_size = self.config.ppo_micro_batch_size_per_gpu
batches = split_dict_tensor_into_batches(data.batch, batch_size=batch_size)
# compute input shapes for pp stages
input_shapes = compute_transformers_input_shapes(
......
......@@ -45,8 +45,8 @@ class DataParallelPPOCritic(BasePPOCritic):
self.use_remove_padding = self.config.model.get('use_remove_padding', False)
print(f'Critic use_remove_padding={self.use_remove_padding}')
assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size == 0
self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size
assert self.config.ppo_mini_batch_size % self.config.ppo_micro_batch_size_per_gpu == 0
self.gradient_accumulation = self.config.ppo_mini_batch_size // self.config.ppo_micro_batch_size_per_gpu
self.ulysses_sequence_parallel_size = self.config.get('ulysses_sequence_parallel_size', 1)
......@@ -161,7 +161,7 @@ class DataParallelPPOCritic(BasePPOCritic):
max_token_len = self.config.ppo_max_token_len_per_gpu * self.ulysses_sequence_parallel_size
micro_batches, _ = rearrange_micro_batches(batch=mini_batch, max_token_len=max_token_len)
else:
micro_batches = mini_batch.split(self.config.ppo_micro_batch_size)
micro_batches = mini_batch.split(self.config.ppo_micro_batch_size_per_gpu)
self.critic_optimizer.zero_grad()
......
......@@ -118,7 +118,7 @@ class MegatronPPOCritic(BasePPOCritic):
group=mpu.get_pipeline_model_parallel_group())
# split into micro-batches
data.batch['attention_mask'] = data.batch['attention_mask'].to(bool)
batches = split_dict_tensor_into_batches(data.batch, batch_size=self.config.ppo_micro_batch_size)
batches = split_dict_tensor_into_batches(data.batch, batch_size=self.config.ppo_micro_batch_size_per_gpu)
n_micro_batch = len(batches)
seq_len = batches[0]['input_ids'].shape[1]
......@@ -182,7 +182,7 @@ class MegatronPPOCritic(BasePPOCritic):
model=self.critic_module,
num_microbatches=n_micro_batch,
input_shapes=input_shapes, # must set for flash-attn sequence packing
seq_length=self.config.ppo_micro_batch_size * seq_len, # no use when input_shapes was set
seq_length=self.config.ppo_micro_batch_size_per_gpu * seq_len, # no use when input_shapes was set
hidden_size=self.model_config.hidden_size, # no use when input_shapes was set
micro_batch_size=1, # no use when input_shapes was set
forward_only=forward_only,
......@@ -193,7 +193,7 @@ class MegatronPPOCritic(BasePPOCritic):
data_iterator=batch_generator,
model=self.critic_module,
num_microbatches=n_micro_batch,
seq_length=self.config.ppo_micro_batch_size * seq_len, # in use for pp = 1
seq_length=self.config.ppo_micro_batch_size_per_gpu * seq_len, # in use for pp = 1
hidden_size=self.model_config.hidden_size, # in use for pp = 1
micro_batch_size=1, # in use for pp = 1
forward_only=forward_only,
......
......@@ -119,18 +119,22 @@ class ActorRolloutRefWorker(Worker):
# normalize config
if self._is_actor:
self.config.actor.ppo_mini_batch_size //= (self.device_mesh.shape[0] // self.ulysses_sequence_parallel_size)
self.config.actor.ppo_micro_batch_size //= (self.device_mesh.shape[0] //
self.ulysses_sequence_parallel_size)
self.config.actor.ppo_mini_batch_size *= self.config.rollout.n
self.config.actor.ppo_micro_batch_size *= self.config.rollout.n
if self._is_rollout:
# micro bsz
if self.config.actor.ppo_micro_batch_size is not None:
self.config.actor.ppo_micro_batch_size //= (self.device_mesh.shape[0] //
self.ulysses_sequence_parallel_size)
self.config.actor.ppo_micro_batch_size_per_gpu = self.config.actor.ppo_micro_batch_size
# normalize rollout config
if self._is_rollout and self.config.rollout.log_prob_micro_batch_size is not None:
self.config.rollout.log_prob_micro_batch_size //= (self.device_mesh.shape[0] //
self.ulysses_sequence_parallel_size)
self.config.rollout.log_prob_micro_batch_size *= self.config.rollout.n
if self._is_ref:
self.config.rollout.log_prob_micro_batch_size_per_gpu = self.config.rollout.log_prob_micro_batch_size
# normalize ref config
if self._is_ref and self.config.ref.log_prob_micro_batch_size is not None:
self.config.ref.log_prob_micro_batch_size //= (self.device_mesh.shape[0] //
self.ulysses_sequence_parallel_size)
self.config.ref.log_prob_micro_batch_size *= self.config.rollout.n
self.config.ref.log_prob_micro_batch_size_per_gpu = self.config.ref.log_prob_micro_batch_size
def _build_model_optimizer(self,
model_path,
......@@ -424,8 +428,6 @@ class ActorRolloutRefWorker(Worker):
@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def generate_sequences(self, prompts: DataProto):
prompts = prompts.to('cuda')
# set to False if it is validation
recompute_log_prob = prompts.meta_info.get('recompute_log_prob', True)
assert self._is_rollout
if self._is_offload_param:
......@@ -461,7 +463,7 @@ class ActorRolloutRefWorker(Worker):
assert self._is_actor
data = data.to('cuda')
# we should always recompute old_log_probs when it is HybridEngine
data.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size
data.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size_per_gpu
data.meta_info['max_token_len'] = self.config.rollout.log_prob_max_token_len_per_gpu
data.meta_info['use_dynamic_bsz'] = self.config.rollout.log_prob_use_dynamic_bsz
data.meta_info['temperature'] = self.config.rollout.temperature
......@@ -489,7 +491,7 @@ class ActorRolloutRefWorker(Worker):
data = data.to('cuda')
micro_batch_size = self.config.ref.log_prob_micro_batch_size
micro_batch_size = self.config.ref.log_prob_micro_batch_size_per_gpu
data.meta_info['micro_batch_size'] = micro_batch_size
data.meta_info['temperature'] = self.config.rollout.temperature
data.meta_info['max_token_len'] = self.config.ref.log_prob_max_token_len_per_gpu
......@@ -573,9 +575,13 @@ class CriticWorker(Worker):
# normalize config
self.config.ppo_mini_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size)
self.config.ppo_micro_batch_size //= (torch.distributed.get_world_size() // self.ulysses_sequence_parallel_size)
self.config.forward_micro_batch_size //= (torch.distributed.get_world_size() //
if self.config.ppo_micro_batch_size is not None:
self.config.ppo_micro_batch_size //= (torch.distributed.get_world_size() //
self.ulysses_sequence_parallel_size)
self.config.forward_micro_batch_size //= (torch.distributed.get_world_size() //
self.ulysses_sequence_parallel_size)
self.config.ppo_micro_batch_size_per_gpu = self.config.ppo_micro_batch_size
self.config.forward_micro_batch_size_per_gpu = self.config.forward_micro_batch_size
def _build_critic_model_optimizer(self, config):
# the following line is necessary
......@@ -724,7 +730,7 @@ class CriticWorker(Worker):
load_fsdp_param_and_grad(module=self.critic_module,
device_id=torch.cuda.current_device(),
load_grad=self._is_offload_grad)
micro_batch_size = self.config.forward_micro_batch_size
micro_batch_size = self.config.forward_micro_batch_size_per_gpu
data.meta_info['micro_batch_size'] = micro_batch_size
data.meta_info['max_token_len'] = self.config.forward_max_token_len_per_gpu
data.meta_info['use_dynamic_bsz'] = self.config.use_dynamic_bsz
......@@ -838,7 +844,11 @@ class RewardModelWorker(Worker):
self.ulysses_sharding_manager = FSDPUlyssesShardingManager(self.ulysses_device_mesh)
self.use_remove_padding = self.config.model.get('use_remove_padding', False)
self.config.micro_batch_size //= torch.distributed.get_world_size()
# normalize config
if self.config.micro_batch_size is not None:
self.config.micro_batch_size //= torch.distributed.get_world_size()
self.config.micro_batch_size_per_gpu = self.config.micro_batch_size
def _build_model(self, config):
# the following line is necessary
......@@ -1054,7 +1064,7 @@ class RewardModelWorker(Worker):
max_token_len = self.config.forward_max_token_len_per_gpu * self.ulysses_sequence_parallel_size
micro_batches, indices = rearrange_micro_batches(batch=rm_data.batch, max_token_len=max_token_len)
else:
micro_batches = rm_data.batch.split(self.config.micro_batch_size)
micro_batches = rm_data.batch.split(self.config.micro_batch_size_per_gpu)
output = []
for micro_batch in micro_batches:
rm_score = self._forward_micro_batch(micro_batch)
......
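The backward-compatibility path in these FSDP workers reduces to: if the deprecated global micro batch size is set, divide it by the data-parallel size (world size over the Ulysses sequence-parallel size) and write the result into the new per-GPU field. A small sketch with hypothetical numbers (the real code reads and writes the OmegaConf config in place):

# Sketch of the deprecated-global -> per-GPU conversion done in the workers above (hypothetical numbers).
world_size = 8
ulysses_sequence_parallel_size = 2
ppo_micro_batch_size = 64            # deprecated global value

dp_size = world_size // ulysses_sequence_parallel_size            # 4 data-parallel ranks
ppo_micro_batch_size_per_gpu = ppo_micro_batch_size // dp_size    # 16 per GPU
assert ppo_micro_batch_size_per_gpu * dp_size == ppo_micro_batch_size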
......@@ -112,13 +112,19 @@ class ActorRolloutRefWorker(MegatronWorker):
# normalize config
if self._is_actor and self._is_rollout:
self.config.actor.ppo_mini_batch_size //= mpu.get_data_parallel_world_size()
self.config.actor.ppo_micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.rollout.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size()
if self.config.actor.ppo_micro_batch_size is not None:
self.config.actor.ppo_micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.rollout.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.actor.ppo_micro_batch_size_per_gpu = self.config.actor.ppo_micro_batch_size
self.config.rollout.log_prob_micro_batch_size_per_gpu = self.config.rollout.log_prob_micro_batch_size
self._is_offload_param = self.config.actor.get('param_offload', False)
self._is_offload_grad = self.config.actor.get('grad_offload', False)
self._is_offload_optimizer = self.config.actor.get('optimizer_offload', False)
elif self._is_ref:
self.config.ref.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size()
if self.config.ref.log_prob_micro_batch_size is not None:
self.config.ref.log_prob_micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.ref.log_prob_micro_batch_size_per_gpu = self.config.ref.log_prob_micro_batch_size
self._is_offload_param = self.config.ref.get('param_offload', False)
def _build_model_optimizer(self,
......@@ -361,7 +367,7 @@ class ActorRolloutRefWorker(MegatronWorker):
validate = prompts.meta_info.get('validate', False)
if self._is_actor and not validate:
# we should always recompute old_log_probs when it is HybridEngine
output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size
output.meta_info['micro_batch_size'] = self.config.rollout.log_prob_micro_batch_size_per_gpu
output.meta_info['temperature'] = self.config.rollout.temperature
old_log_probs = self.actor.compute_log_prob(data=output)
output.batch['old_log_probs'] = old_log_probs
......@@ -380,7 +386,7 @@ class ActorRolloutRefWorker(MegatronWorker):
if self._is_offload_param:
load_megatron_param_and_grad(self.ref_module, torch.cuda.current_device(), self._is_offload_grad)
micro_batch_size = self.config.rollout.log_prob_micro_batch_size
micro_batch_size = self.config.rollout.log_prob_micro_batch_size_per_gpu
data.meta_info['micro_batch_size'] = micro_batch_size
data.meta_info['temperature'] = self.config.rollout.temperature
output = self.ref_policy.compute_log_prob(data=data)
......@@ -439,7 +445,9 @@ class CriticWorker(MegatronWorker):
# normalize config
self.config.ppo_mini_batch_size //= mpu.get_data_parallel_world_size()
self.config.ppo_micro_batch_size //= mpu.get_data_parallel_world_size()
if self.config.ppo_micro_batch_size is not None:
self.config.ppo_micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.ppo_micro_batch_size_per_gpu = self.config.ppo_micro_batch_size
# TODO(sgm): support critic model offload
......@@ -609,7 +617,9 @@ class RewardModelWorker(MegatronWorker):
set_random_seed(seed=self.config.megatron.seed)
# normalize config
self.config.micro_batch_size //= mpu.get_data_parallel_world_size()
if self.config.micro_batch_size is not None:
self.config.micro_batch_size //= mpu.get_data_parallel_world_size()
self.config.micro_batch_size_per_gpu = self.config.micro_batch_size
def _build_rm_model(self, model_path, megatron_config: ModelParallelConfig, override_model_config):
from megatron.core.models.gpt.gpt_model import ModelType
......
......@@ -196,8 +196,8 @@ class MegatronRewardModel(BasePPORewardModel):
group=mpu.get_pipeline_model_parallel_group())
# split into micro-batches
if self.config is not None and 'ppo_micro_batch_size' in self.config:
infer_batch_size = self.config.ppo_micro_batch_size
if self.config is not None and 'ppo_micro_batch_size_per_gpu' in self.config:
infer_batch_size = self.config.ppo_micro_batch_size_per_gpu
else:
infer_batch_size = data.batch.batch_size[0]
......