Commit ee74bb52 by Yaoyu Zhu

update gitignore

parent f2982c41
......@@ -11,6 +11,7 @@
**/wandb
**/ret_one
*.slurm*
*.sh*
# Byte-compiled / optimized / DLL files
__pycache__/
......
set -x

# Megatron PPO checkpoint save/load run for deepseek-ai/deepseek-llm-7b-chat
# on the GSM8K parquet files under $HOME/data/gsm8k.
#
# Two trainer invocations share every setting except the trailing ones:
#   1. save_freq=100, total_training_steps=100
#   2. resume_mode=auto, save_freq=-1, total_training_steps=150
#      (presumably resumes from the checkpoint written by run 1 -- confirm
#      against the trainer's resume_mode handling)
#
# Any arguments given to this script are appended to both invocations.

# the config file used: verl/trainer/main_ppo/config/ppo_megatron_trainer.yaml

huggingface-cli download deepseek-ai/deepseek-llm-7b-chat

export VLLM_ATTENTION_BACKEND=XFORMERS

# Runs the trainer with the settings common to both phases; the caller's
# arguments ("$@" of the function) are appended after them, in order.
run_ppo() {
    python3 -m verl.trainer.main_ppo --config-path=config \
        --config-name='ppo_megatron_trainer.yaml' \
        "data.train_files=$HOME/data/gsm8k/train.parquet" \
        "data.val_files=$HOME/data/gsm8k/test.parquet" \
        data.train_batch_size=1024 \
        data.max_prompt_length=512 \
        data.max_response_length=512 \
        actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
        actor_rollout_ref.actor.optim.lr=2e-6 \
        actor_rollout_ref.actor.ppo_mini_batch_size=256 \
        actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
        actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
        actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
        actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
        actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
        actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
        actor_rollout_ref.rollout.name=vllm \
        actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
        actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
        actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
        actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
        actor_rollout_ref.ref.megatron.tensor_model_parallel_size=4 \
        critic.optim.lr=2e-5 \
        critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
        critic.model.enable_gradient_checkpointing=False \
        critic.ppo_micro_batch_size_per_gpu=4 \
        critic.megatron.pipeline_model_parallel_size=2 \
        critic.megatron.virtual_pipeline_model_parallel_size=2 \
        critic.megatron.tensor_model_parallel_size=4 \
        algorithm.kl_ctrl.kl_coef=0.001 \
        trainer.critic_warmup=0 \
        'trainer.logger=[console,wandb]' \
        trainer.project_name='verl_megatron_checkpoint' \
        trainer.experiment_name='deepseek_megatron_checkpoint_saveload' \
        trainer.n_gpus_per_node=16 \
        trainer.nnodes=1 \
        "$@"
}

# Phase 1: train 100 steps with save_freq=100.
# "$@" is quoted so forwarded arguments keep embedded spaces and glob characters.
run_ppo \
    trainer.save_freq=100 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=100 "$@"

# Phase 2: same run with resume_mode=auto, checkpoint saving disabled via
# save_freq=-1, and the step budget raised to 150.
run_ppo \
    trainer.resume_mode=auto \
    trainer.save_freq=-1 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=150 "$@"
\ No newline at end of file
set -x

# Megatron PPO checkpoint save/load run for Qwen/Qwen2-7B-Instruct on the
# GSM8K parquet files under $HOME/data/gsm8k.
#
# Two trainer invocations share every setting except the trailing ones:
#   1. save_freq=100, total_training_steps=100
#   2. resume_mode=auto, save_freq=-1, total_training_steps=150
#      (presumably resumes from the checkpoint written by run 1 -- confirm
#      against the trainer's resume_mode handling)
#
# Any arguments given to this script are appended to both invocations.

# the config file used: verl/trainer/main_ppo/config/ppo_megatron_trainer.yaml

huggingface-cli download Qwen/Qwen2-7B-Instruct

export VLLM_ATTENTION_BACKEND=XFORMERS

# Runs the trainer with the settings common to both phases; the caller's
# arguments ("$@" of the function) are appended after them, in order.
run_ppo() {
    python3 -m verl.trainer.main_ppo --config-path=config \
        --config-name='ppo_megatron_trainer.yaml' \
        "data.train_files=$HOME/data/gsm8k/train.parquet" \
        "data.val_files=$HOME/data/gsm8k/test.parquet" \
        data.train_batch_size=1024 \
        data.max_prompt_length=512 \
        data.max_response_length=512 \
        actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
        actor_rollout_ref.actor.optim.lr=2e-6 \
        actor_rollout_ref.actor.ppo_mini_batch_size=256 \
        actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
        actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
        actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
        actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
        actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
        actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
        actor_rollout_ref.rollout.name=vllm \
        actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
        actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
        actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
        actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
        actor_rollout_ref.ref.megatron.tensor_model_parallel_size=4 \
        critic.optim.lr=2e-5 \
        critic.model.path=Qwen/Qwen2-7B-Instruct \
        critic.model.enable_gradient_checkpointing=False \
        critic.ppo_micro_batch_size_per_gpu=4 \
        critic.megatron.pipeline_model_parallel_size=2 \
        critic.megatron.virtual_pipeline_model_parallel_size=2 \
        critic.megatron.tensor_model_parallel_size=4 \
        algorithm.kl_ctrl.kl_coef=0.001 \
        trainer.critic_warmup=0 \
        'trainer.logger=[console,wandb]' \
        trainer.project_name='verl_megatron_checkpoint' \
        trainer.experiment_name='qwen2_7b_megatron_saveload' \
        trainer.n_gpus_per_node=16 \
        trainer.nnodes=1 \
        "$@"
}

# Phase 1: train 100 steps with save_freq=100.
# "$@" is quoted so forwarded arguments keep embedded spaces and glob characters.
run_ppo \
    trainer.save_freq=100 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=100 "$@"

# Phase 2: same run with resume_mode=auto, checkpoint saving disabled via
# save_freq=-1, and the step budget raised to 150.
run_ppo \
    trainer.resume_mode=auto \
    trainer.save_freq=-1 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=150 "$@"
\ No newline at end of file
set -x

# Runs verl.trainer.main_generation across 2 nodes x 8 GPUs: reads prompts
# from $data_path (column "prompt"), writes one sample per prompt to
# $save_path, using the model named in $model_path.

data_path="$HOME/data/rlhf/gsm8k/test.parquet"
save_path="$HOME/data/rlhf/math/deepseek_v2_lite_gen_test.parquet"
model_path=deepseek-ai/deepseek-llm-7b-chat

# Each continuation backslash is preceded by a space: a backslash-newline is
# removed entirely by the shell, so without the space two arguments would be
# joined into one word.
python3 -m verl.trainer.main_generation \
    trainer.nnodes=2 \
    trainer.n_gpus_per_node=8 \
    "data.path=$data_path" \
    data.prompt_key=prompt \
    data.n_samples=1 \
    "data.output_path=$save_path" \
    "model.path=$model_path" \
    +model.trust_remote_code=True \
    rollout.temperature=1.0 \
    rollout.top_k=50 \
    rollout.top_p=0.7 \
    rollout.prompt_length=2048 \
    rollout.response_length=1024 \
    rollout.tensor_model_parallel_size=16 \
    rollout.gpu_memory_utilization=0.8
# Runs verl.trainer.main_generation on 1 node x 8 GPUs: reads prompts from
# the GSM8K test parquet (column "prompt") and writes one sample per prompt
# to the output parquet, using deepseek-ai/deepseek-llm-7b-chat.
#
# Paths use $HOME rather than "~": bash only tilde-expands after "=" when the
# text before it is a valid variable name, and "data.path" is not, so a
# literal "~/..." would be handed to the program unexpanded.
python3 -m verl.trainer.main_generation \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node=8 \
    "data.path=$HOME/data/rlhf/gsm8k/test.parquet" \
    data.prompt_key=prompt \
    data.n_samples=1 \
    "data.output_path=$HOME/data/rlhf/math/deepseek_v2_lite_gen_test.parquet" \
    model.path=deepseek-ai/deepseek-llm-7b-chat \
    +model.trust_remote_code=True \
    rollout.temperature=1.0 \
    rollout.top_k=50 \
    rollout.top_p=0.7 \
    rollout.prompt_length=2048 \
    rollout.response_length=1024 \
    rollout.tensor_model_parallel_size=2 \
    rollout.gpu_memory_utilization=0.8
set -x

# GRPO run (algorithm.adv_estimator=grpo) of verl.trainer.main_ppo on the
# GSM8K parquet files with deepseek-ai/deepseek-llm-7b-chat, 1 node x 8 GPUs,
# console logging only.
# Arguments given to this script are appended after the settings below.

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    "data.train_files=$HOME/data/gsm8k/train.parquet" \
    "data.val_files=$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='deepseek_llm_7b_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
set -x

# GRPO run of verl.trainer.main_ppo on GSM8K + MATH parquet files with
# deepseek-ai/deepseek-llm-7b-chat, 1 node x 16 GPUs.
# Arguments given to this script are appended after the settings below.

export VLLM_ATTENTION_BACKEND=XFORMERS

gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# List-valued settings are written as a single bracketed string.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console,wandb]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='deepseek_llm_7b_function_rm_math' \
    trainer.n_gpus_per_node=16 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
set -x

# GRPO run of verl.trainer.main_ppo with the ppo_megatron_trainer.yaml config
# on GSM8K + MATH parquet files, model deepseek-ai/deepseek-llm-7b-chat,
# 1 node x 16 GPUs.
# Arguments given to this script are appended after the settings below.

export VLLM_ATTENTION_BACKEND=XFORMERS

gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# List-valued settings are written as a single bracketed string.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# Every continuation backslash is preceded by a space so that adjacent
# arguments are never joined into one word.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=grpo \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console,wandb]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='deepseek_llm_7b_function_rm_math_megatron' \
    trainer.n_gpus_per_node=16 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
set -x

# GRPO run of verl.trainer.main_ppo with the ppo_megatron_trainer.yaml config
# on the GSM8K parquet files, model deepseek-ai/deepseek-llm-7b-chat,
# 1 node x 16 GPUs.
# Arguments given to this script are appended after the settings below.

# Every continuation backslash is preceded by a space so that adjacent
# arguments are never joined into one word.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=grpo \
    "data.train_files=$HOME/data/gsm8k/train.parquet" \
    "data.val_files=$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console,wandb]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='deepseek_llm_7b_function_rm_megatron' \
    trainer.n_gpus_per_node=16 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
set -x

# GRPO run of verl.trainer.main_ppo on the GSM8K parquet files with
# deepseek-ai/deepseek-llm-7b-chat, 1 node x 8 GPUs. Instead of fixed
# per-GPU micro batch sizes it sets actor.use_dynamic_bsz=True with
# ppo_max_token_len_per_gpu=24000.
# Arguments given to this script are appended after the settings below.

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    "data.train_files=$HOME/data/gsm8k/train.parquet" \
    "data.val_files=$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console,wandb]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='deepseek_llm_7b_function_rm_seq_packing' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
set -x

# GRPO run of verl.trainer.main_ppo on the GSM8K parquet files with
# Qwen/Qwen2-7B-Instruct, 1 node x 8 GPUs.
# Arguments given to this script are appended after the settings below.

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    "data.train_files=$HOME/data/gsm8k/train.parquet" \
    "data.val_files=$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=40 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console,wandb]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='qwen2_7b_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
set -x

# GRPO run of verl.trainer.main_ppo on GSM8K + MATH parquet files with
# Qwen/Qwen2-7B-Instruct, 1 node x 16 GPUs.
# Arguments given to this script are appended after the settings below.

export VLLM_ATTENTION_BACKEND=XFORMERS

gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# List-valued settings are written as a single bracketed string.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console,wandb]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='qwen2_7b_function_rm' \
    trainer.n_gpus_per_node=16 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
set -x

# GRPO run of verl.trainer.main_ppo with the ppo_megatron_trainer.yaml config
# on GSM8K + MATH parquet files, model Qwen/Qwen2-7B-Instruct,
# 1 node x 16 GPUs.
# Arguments given to this script are appended after the settings below.

export VLLM_ATTENTION_BACKEND=XFORMERS

gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# List-valued settings are written as a single bracketed string.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# Every continuation backslash is preceded by a space so that adjacent
# arguments are never joined into one word.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=grpo \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console,wandb]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='qwen2_7b_function_rm_megatron' \
    trainer.n_gpus_per_node=16 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
set -x

# GRPO run of verl.trainer.main_ppo with the ppo_megatron_trainer.yaml config
# on the GSM8K parquet files, model Qwen/Qwen2-7B-Instruct, 1 node x 16 GPUs.
# Arguments given to this script are appended after the settings below.

export VLLM_ATTENTION_BACKEND=XFORMERS

# Every continuation backslash is preceded by a space so that adjacent
# arguments are never joined into one word.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=grpo \
    "data.train_files=$HOME/data/gsm8k/train.parquet" \
    "data.val_files=$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console,wandb]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='qwen2_7b_function_rm_megatron' \
    trainer.n_gpus_per_node=16 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
set -x

# GRPO run of verl.trainer.main_ppo on the GSM8K parquet files with
# Qwen/Qwen2-7B-Instruct, 1 node x 8 GPUs. Sets actor.use_dynamic_bsz=True
# with ppo_max_token_len_per_gpu=24000 and adds
# trainer.val_before_train=False.
# Arguments given to this script are appended after the settings below.

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    "data.train_files=$HOME/data/gsm8k/train.parquet" \
    "data.val_files=$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console,wandb]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='qwen2_7b_function_rm_kl1e-3' \
    +trainer.val_before_train=False \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
set -x

# GRPO run of verl.trainer.main_ppo on the geo3k parquet files with
# Qwen/Qwen2.5-VL-7B-Instruct, 1 node x 8 GPUs.
#
# Usage: <script> [ENGINE] [extra key=value arguments...]
#   ENGINE  value for actor_rollout_ref.rollout.name (default: vllm)
# Remaining arguments are appended after the settings below.

ENGINE=${1:-vllm}
# The first positional argument has been consumed as ENGINE; drop it so that
# "$@" below forwards only the remaining arguments instead of passing the
# engine name to the trainer a second time as a bare word.
if [ "$#" -gt 0 ]; then
    shift
fi

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    "data.train_files=$HOME/data/geo3k/train.parquet" \
    "data.val_files=$HOME/data/geo3k/test.parquet" \
    data.train_batch_size=512 \
    data.max_prompt_length=1024 \
    data.max_response_length=2048 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    data.image_key=images \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-VL-7B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=128 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=10 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.01 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    "actor_rollout_ref.rollout.name=$ENGINE" \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.enable_chunked_prefill=False \
    actor_rollout_ref.rollout.enforce_eager=False \
    actor_rollout_ref.rollout.free_cache_engine=False \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console,wandb]' \
    trainer.project_name='verl_grpo_example_geo3k' \
    trainer.experiment_name='qwen2_5_vl_7b_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 "$@"  # quoted: forwarded args are not re-split or globbed
\ No newline at end of file
# Launch verl.trainer.main_ppo on the GSM8K parquet files under
# $HOME/data/gsm8k, using deepseek-ai/deepseek-llm-7b-chat as both the actor
# and the critic model path and vllm as the rollout name.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

# Collect every override in the positional parameters. "$@" stays quoted so
# caller-supplied values containing spaces or glob characters are not re-split.
# The logger list is quoted as a whole so the shell cannot treat the brackets
# as a glob pattern; the program receives the same argument text as before.
set -- \
  "data.train_files=$HOME/data/gsm8k/train.parquet" \
  "data.val_files=$HOME/data/gsm8k/test.parquet" \
  data.train_batch_size=1024 \
  data.max_prompt_length=512 \
  data.max_response_length=512 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  critic.optim.lr=1e-5 \
  critic.model.use_remove_padding=True \
  critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
  critic.model.enable_gradient_checkpointing=True \
  critic.ppo_micro_batch_size_per_gpu=32 \
  critic.model.fsdp_config.param_offload=False \
  critic.model.fsdp_config.optimizer_offload=False \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_example_gsm8k' \
  trainer.experiment_name='deepseek_llm_7b_function_rm' \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=1 \
  trainer.total_epochs=15 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo on the GSM8K parquet files under
# $HOME/data/gsm8k with VERL_USE_MODELSCOPE=True set in the trainer's
# environment, using deepseek-ai/deepseek-llm-7b-chat as both the actor and
# the critic model path and vllm as the rollout name.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

# Collect every override in the positional parameters. "$@" stays quoted so
# caller-supplied values containing spaces or glob characters are not re-split.
# The logger list is quoted as a whole so the shell cannot treat the brackets
# as a glob pattern; the program receives the same argument text as before.
set -- \
  "data.train_files=$HOME/data/gsm8k/train.parquet" \
  "data.val_files=$HOME/data/gsm8k/test.parquet" \
  data.train_batch_size=1024 \
  data.max_prompt_length=512 \
  data.max_response_length=512 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  critic.optim.lr=1e-5 \
  critic.model.use_remove_padding=True \
  critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
  critic.model.enable_gradient_checkpointing=True \
  critic.ppo_micro_batch_size_per_gpu=32 \
  critic.model.fsdp_config.param_offload=False \
  critic.model.fsdp_config.optimizer_offload=False \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_example_gsm8k' \
  trainer.experiment_name='deepseek_llm_7b_function_rm' \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=1 \
  trainer.total_epochs=15 \
  "$@"

# The variable is set only for this one command, not exported to the shell.
VERL_USE_MODELSCOPE=True python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo on the GSM8K parquet files under
# $HOME/data/gsm8k, using deepseek-ai/deepseek-llm-7b-chat as both the actor
# and the critic model path, with ulysses_sequence_parallel_size=2 for the
# actor and the critic and vllm as the rollout name.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

# Collect every override in the positional parameters. "$@" stays quoted so
# caller-supplied values containing spaces or glob characters are not re-split.
# The logger list is quoted as a whole so the shell cannot treat the brackets
# as a glob pattern; the program receives the same argument text as before.
set -- \
  "data.train_files=$HOME/data/gsm8k/train.parquet" \
  "data.val_files=$HOME/data/gsm8k/test.parquet" \
  data.train_batch_size=1024 \
  data.max_prompt_length=512 \
  data.max_response_length=512 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=32 \
  actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=64 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=64 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  critic.optim.lr=1e-5 \
  critic.ulysses_sequence_parallel_size=2 \
  critic.model.use_remove_padding=True \
  critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
  critic.model.enable_gradient_checkpointing=True \
  critic.ppo_micro_batch_size_per_gpu=64 \
  critic.model.fsdp_config.param_offload=False \
  critic.model.fsdp_config.optimizer_offload=False \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_example_gsm8k' \
  trainer.experiment_name='deepseek_llm_7b_function_rm_sp2' \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=5 \
  trainer.total_epochs=15 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo with the ppo_megatron_trainer config on the
# full_hh_rlhf parquet file under $HOME/data/full_hh_rlhf/rl, using
# deepseek-ai/deepseek-llm-7b-chat as the actor, critic and reward model path
# (reward_model.enable=True).
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

train_files="$HOME/data/full_hh_rlhf/rl/train.parquet"
# Marked "no use" by the original author: the training file is passed again
# as the validation file.
test_files="$HOME/data/full_hh_rlhf/rl/train.parquet"

# Collect every option and override in the positional parameters. Each word
# is separated from its line continuation by a space so that adjacent words
# can never fuse. "$@" stays quoted so caller-supplied values containing
# spaces or glob characters are not re-split. The logger list is quoted as a
# whole so the shell cannot treat the brackets as a glob pattern; the program
# receives the same argument text as before.
set -- \
  --config-path=./config \
  --config-name='ppo_megatron_trainer' \
  "data.train_files=$train_files" \
  "data.val_files=$test_files" \
  data.train_batch_size=512 \
  data.max_prompt_length=128 \
  data.max_response_length=128 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.actor.ppo_mini_batch_size=128 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.ref.param_offload=False \
  critic.optim.lr=1e-5 \
  critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
  critic.model.enable_gradient_checkpointing=False \
  critic.ppo_micro_batch_size_per_gpu=4 \
  reward_model.enable=True \
  reward_model.megatron.tensor_model_parallel_size=4 \
  reward_model.model.path=deepseek-ai/deepseek-llm-7b-chat \
  reward_model.micro_batch_size_per_gpu=4 \
  reward_model.param_offload=False \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_megatron_full_hh_rlhf_examples' \
  trainer.experiment_name='deepseek_llm_7b_model_rm' \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=5 \
  trainer.total_epochs=100 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo with the ppo_megatron_trainer config on the
# combined GSM8K and MATH parquet files under $HOME/data, using
# deepseek-ai/deepseek-coder-6.7b-instruct as both the actor and the critic
# model path.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# Both datasets are passed as one bracketed list value per override.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# Collect every option and override in the positional parameters. Each word
# is separated from its line continuation by a space so that adjacent words
# can never fuse. "$@" stays quoted so caller-supplied values containing
# spaces or glob characters are not re-split. The logger list is quoted as a
# whole so the shell cannot treat the brackets as a glob pattern; the program
# receives the same argument text as before.
# NOTE(review): the experiment name still says deepseek_llm_7b although the
# model path is deepseek-coder-6.7b-instruct; confirm whether to rename.
set -- \
  --config-path=./config \
  --config-name='ppo_megatron_trainer' \
  "data.train_files=$train_files" \
  "data.val_files=$test_files" \
  data.train_batch_size=1024 \
  data.max_prompt_length=1024 \
  data.max_response_length=512 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
  critic.optim.lr=1e-5 \
  critic.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
  critic.model.enable_gradient_checkpointing=False \
  critic.ppo_micro_batch_size_per_gpu=4 \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_megatron_math_gsm8k_examples' \
  trainer.experiment_name='deepseek_llm_7b_function_rm' \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=5 \
  trainer.total_epochs=100 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo with the ppo_megatron_trainer.yaml config on
# the GSM8K parquet files under $HOME/data/gsm8k, using
# deepseek-ai/deepseek-coder-6.7b-instruct as both the actor and the critic
# model path, with pipeline, virtual-pipeline and tensor model parallel sizes
# set for the actor, the reference and the critic.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

# Prepare the pre-trained model checkpoint.
# deepseek-llm-7b-chat has 30 layers, which does not work well with PP=2 and
# VPP=2, so a 6.7b model is used instead.
# huggingface-cli download deepseek-ai/deepseek-llm-7b-chat --local-dir $HOME/models/deepseek-llm-7b-chat
huggingface-cli download deepseek-ai/deepseek-coder-6.7b-instruct

# ``actor_rollout_ref.rollout.tensor_model_parallel_size`` in theory could be
# different from ``**.megatron.tensor_model_parallel_size``.
# The config file used: verl/trainer/main_ppo/config/ppo_megatron_trainer.yaml
# Tested on L20 nodes with 16 GPUs per node; for other machines adjust
# trainer.n_gpus_per_node accordingly.
#
# Collect every option and override in the positional parameters. Each word
# is separated from its line continuation by a space so that adjacent words
# can never fuse. "$@" stays quoted so caller-supplied values containing
# spaces or glob characters are not re-split. The logger list is quoted as a
# whole so the shell cannot treat the brackets as a glob pattern; the program
# receives the same argument text as before.
# NOTE(review): the experiment name still says deepseek_llm_7b although the
# model path is deepseek-coder-6.7b-instruct; confirm whether to rename.
set -- \
  --config-path=config \
  --config-name='ppo_megatron_trainer.yaml' \
  "data.train_files=$HOME/data/gsm8k/train.parquet" \
  "data.val_files=$HOME/data/gsm8k/test.parquet" \
  data.train_batch_size=1024 \
  data.max_prompt_length=512 \
  data.max_response_length=512 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
  actor_rollout_ref.actor.optim.lr=2e-6 \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
  actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
  actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
  actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
  actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
  actor_rollout_ref.ref.megatron.tensor_model_parallel_size=4 \
  critic.optim.lr=2e-5 \
  critic.model.path=deepseek-ai/deepseek-coder-6.7b-instruct \
  critic.model.enable_gradient_checkpointing=False \
  critic.ppo_micro_batch_size_per_gpu=4 \
  critic.megatron.pipeline_model_parallel_size=2 \
  critic.megatron.virtual_pipeline_model_parallel_size=2 \
  critic.megatron.tensor_model_parallel_size=4 \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_megatron_gsm8k_examples' \
  trainer.experiment_name='deepseek_llm_7b_function_rm' \
  trainer.n_gpus_per_node=16 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.total_epochs=15 \
  +trainer.val_before_train=False \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo on the GSM8K parquet files under
# $HOME/data/gsm8k, using google/gemma-2-2b-it as both the actor and the
# critic model path and vllm as the rollout name, with
# trainer.n_gpus_per_node=2.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

# Collect every override in the positional parameters. "$@" stays quoted so
# caller-supplied values containing spaces or glob characters are not re-split.
# The logger list is quoted as a whole so the shell cannot treat the brackets
# as a glob pattern; the program receives the same argument text as before.
set -- \
  "data.train_files=$HOME/data/gsm8k/train.parquet" \
  "data.val_files=$HOME/data/gsm8k/test.parquet" \
  data.train_batch_size=512 \
  data.max_prompt_length=1024 \
  data.max_response_length=512 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=google/gemma-2-2b-it \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=False \
  actor_rollout_ref.actor.ppo_mini_batch_size=128 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  critic.optim.lr=1e-5 \
  critic.model.use_remove_padding=False \
  critic.model.path=google/gemma-2-2b-it \
  critic.model.enable_gradient_checkpointing=False \
  critic.ppo_micro_batch_size_per_gpu=4 \
  critic.model.fsdp_config.param_offload=False \
  critic.model.fsdp_config.optimizer_offload=False \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_example' \
  trainer.experiment_name='gemma2b_function_rm' \
  trainer.n_gpus_per_node=2 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=10 \
  trainer.total_epochs=15 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo with the ppo_megatron_trainer config on the
# combined GSM8K and MATH parquet files under $HOME/data, using
# Qwen/Qwen2-7B-Instruct as both the actor and the critic model path.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# Both datasets are passed as one bracketed list value per override.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# Collect every option and override in the positional parameters. Each word
# is separated from its line continuation by a space so that adjacent words
# can never fuse. "$@" stays quoted so caller-supplied values containing
# spaces or glob characters are not re-split. The logger list is quoted as a
# whole so the shell cannot treat the brackets as a glob pattern; the program
# receives the same argument text as before.
set -- \
  --config-path=./config \
  --config-name='ppo_megatron_trainer' \
  "data.train_files=$train_files" \
  "data.val_files=$test_files" \
  data.train_batch_size=1024 \
  data.max_prompt_length=1024 \
  data.max_response_length=512 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
  critic.optim.lr=1e-5 \
  critic.model.path=Qwen/Qwen2-7B-Instruct \
  critic.model.enable_gradient_checkpointing=False \
  critic.ppo_micro_batch_size_per_gpu=4 \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_megatron_math_gsm8k_examples' \
  trainer.experiment_name='qwen2_7b_function_rm' \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=5 \
  trainer.total_epochs=100 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo with the ppo_megatron_trainer.yaml config on
# the combined GSM8K and MATH parquet files under $HOME/data, using
# Qwen/Qwen2-7B-Instruct as both the actor and the critic model path, with
# pipeline, virtual-pipeline and tensor model parallel sizes set for the
# actor, the reference and the critic.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# Both datasets are passed as one bracketed list value per override.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# Tested on L20 nodes with 16 GPUs per node; for other machines adjust
# trainer.n_gpus_per_node accordingly.
#
# Collect every option and override in the positional parameters. Each word
# is separated from its line continuation by a space so that adjacent words
# can never fuse. "$@" stays quoted so caller-supplied values containing
# spaces or glob characters are not re-split. The logger list is quoted as a
# whole so the shell cannot treat the brackets as a glob pattern; the program
# receives the same argument text as before.
set -- \
  --config-path=config \
  --config-name='ppo_megatron_trainer.yaml' \
  "data.train_files=$train_files" \
  "data.val_files=$test_files" \
  data.train_batch_size=1024 \
  data.max_prompt_length=1024 \
  data.max_response_length=512 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
  actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
  actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
  actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
  actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
  actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
  actor_rollout_ref.ref.megatron.tensor_model_parallel_size=4 \
  critic.optim.lr=1e-5 \
  critic.model.path=Qwen/Qwen2-7B-Instruct \
  critic.model.enable_gradient_checkpointing=True \
  critic.ppo_micro_batch_size_per_gpu=4 \
  critic.megatron.pipeline_model_parallel_size=2 \
  critic.megatron.virtual_pipeline_model_parallel_size=2 \
  critic.megatron.tensor_model_parallel_size=4 \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_megatron_gsm8k_examples' \
  trainer.experiment_name='Qwen2-7B-Instruct_function_rm_megatron' \
  trainer.n_gpus_per_node=16 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=10 \
  trainer.total_epochs=15 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Disclaimer: the model used in this script is for academic purposes only.
#
# Launch verl.trainer.main_ppo on the combined GSM8K and MATH parquet files
# under $HOME/data, using the locally downloaded Qwen2-7B-Instruct as both the
# actor and the critic model path and FsfairX-LLaMA3-RM-v0.1 as the reward
# model path (reward_model.enable=True).
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

# Data preparation scripts are available in ``examples/data_preprocess``.
# Example usage:
#
# python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
# python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# Both datasets are passed as one bracketed list value per override.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2-7b with flash_attn has some issues

# Prepare the model checkpoints: both downloads run in the background and the
# script waits for both to finish before starting the trainer.
huggingface-cli download Qwen/Qwen2-7B-Instruct --local-dir "$HOME/models/Qwen2-7B-Instruct" &
huggingface-cli download sfairXC/FsfairX-LLaMA3-RM-v0.1 --local-dir "$HOME/models/FsfairX-LLaMA3-RM-v0.1" &
wait

# Collect every override in the positional parameters. "$@" stays quoted so
# caller-supplied values containing spaces or glob characters are not re-split.
# The logger list is quoted as a whole so the shell cannot treat the brackets
# as a glob pattern; the program receives the same argument text as before.
set -- \
  "data.train_files=$train_files" \
  "data.val_files=$test_files" \
  data.train_batch_size=1024 \
  data.max_prompt_length=1024 \
  data.max_response_length=512 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  data.return_raw_chat=True \
  "actor_rollout_ref.model.path=$HOME/models/Qwen2-7B-Instruct" \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  critic.optim.lr=1e-5 \
  critic.model.use_remove_padding=True \
  critic.optim.lr_warmup_steps_ratio=0.05 \
  "critic.model.path=$HOME/models/Qwen2-7B-Instruct" \
  critic.model.enable_gradient_checkpointing=True \
  critic.ppo_micro_batch_size_per_gpu=32 \
  critic.model.fsdp_config.param_offload=False \
  critic.model.fsdp_config.optimizer_offload=False \
  reward_model.enable=True \
  "reward_model.model.path=$HOME/models/FsfairX-LLaMA3-RM-v0.1" \
  reward_model.model.use_remove_padding=True \
  reward_model.model.fsdp_config.param_offload=True \
  reward_model.micro_batch_size_per_gpu=32 \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_example' \
  +trainer.val_before_train=False \
  trainer.experiment_name='Qwen2-7B-Instruct_hybrid_rm' \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=5 \
  trainer.total_epochs=15 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo on the combined GSM8K and MATH parquet files
# under $HOME/data, using Qwen/Qwen2-7B-Instruct as both the actor and the
# critic model path and sfairXC/FsfairX-LLaMA3-RM-v0.1 as the reward model
# path, with use_dynamic_bsz=True for the actor, the critic and the reward
# model.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# Both datasets are passed as one bracketed list value per override.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# Collect every override in the positional parameters. Each word is separated
# from its line continuation by a space so that adjacent overrides can never
# fuse into one word. "$@" stays quoted so caller-supplied values containing
# spaces or glob characters are not re-split. The logger list is quoted as a
# whole so the shell cannot treat the brackets as a glob pattern; the program
# receives the same argument text as before.
set -- \
  "data.train_files=$train_files" \
  "data.val_files=$test_files" \
  data.train_batch_size=4096 \
  data.max_prompt_length=4096 \
  data.max_response_length=4096 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  data.return_raw_chat=True \
  actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.actor.ppo_mini_batch_size=512 \
  actor_rollout_ref.actor.use_dynamic_bsz=True \
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=24000 \
  critic.optim.lr=1e-5 \
  critic.model.use_remove_padding=True \
  critic.model.path=Qwen/Qwen2-7B-Instruct \
  critic.model.enable_gradient_checkpointing=True \
  critic.use_dynamic_bsz=True \
  critic.ppo_max_token_len_per_gpu=98304 \
  critic.model.fsdp_config.param_offload=False \
  critic.model.fsdp_config.optimizer_offload=False \
  reward_model.enable=True \
  reward_model.model.path=sfairXC/FsfairX-LLaMA3-RM-v0.1 \
  reward_model.model.use_remove_padding=True \
  reward_model.model.fsdp_config.param_offload=True \
  reward_model.micro_batch_size_per_gpu=32 \
  reward_model.use_dynamic_bsz=True \
  reward_model.forward_max_token_len_per_gpu=98304 \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_example_gsm8k' \
  trainer.experiment_name='qwen2-7b_hybrid_rm_bsz8k_p4k_r4k_seq_packing' \
  trainer.n_gpus_per_node=8 \
  +trainer.val_before_train=False \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=5 \
  trainer.total_epochs=15 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo on the combined GSM8K and MATH parquet files
# under $HOME/data, using Qwen/Qwen2-7B-Instruct as both the actor and the
# critic model path, with actor_rollout_ref.actor.use_dynamic_bsz=True and
# per-GPU token-length limits for the actor, rollout, reference and critic.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# Both datasets are passed as one bracketed list value per override.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# Collect every override in the positional parameters. "$@" stays quoted so
# caller-supplied values containing spaces or glob characters are not re-split.
# The logger list is quoted as a whole so the shell cannot treat the brackets
# as a glob pattern; the program receives the same argument text as before.
set -- \
  "data.train_files=$train_files" \
  "data.val_files=$test_files" \
  data.train_batch_size=4096 \
  data.max_prompt_length=4096 \
  data.max_response_length=4096 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.actor.ppo_mini_batch_size=512 \
  actor_rollout_ref.actor.use_dynamic_bsz=True \
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=24000 \
  critic.optim.lr=1e-5 \
  critic.model.use_remove_padding=True \
  critic.model.path=Qwen/Qwen2-7B-Instruct \
  critic.model.enable_gradient_checkpointing=True \
  critic.ppo_max_token_len_per_gpu=98304 \
  critic.model.fsdp_config.param_offload=False \
  critic.model.fsdp_config.optimizer_offload=False \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_example_gsm8k' \
  trainer.experiment_name='qwen2-7b_function_rm_bsz8k_p4k_r4k_seq_packing' \
  trainer.n_gpus_per_node=8 \
  +trainer.val_before_train=False \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=5 \
  trainer.total_epochs=15 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo on the combined GSM8K and MATH parquet files
# under $HOME/data, using Qwen/Qwen2.5-32B-Instruct as both the actor and the
# critic model path, with trainer.nnodes=4 and trainer.n_gpus_per_node=8.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

gsm8k_train_path="$HOME/data/gsm8k/train.parquet"
gsm8k_test_path="$HOME/data/gsm8k/test.parquet"
math_train_path="$HOME/data/math/train.parquet"
math_test_path="$HOME/data/math/test.parquet"

# Both datasets are passed as one bracketed list value per override.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# Collect every override in the positional parameters. "$@" stays quoted so
# caller-supplied values containing spaces or glob characters are not re-split.
# The logger list is quoted as a whole so the shell cannot treat the brackets
# as a glob pattern; the program receives the same argument text as before.
# actor_rollout_ref.model.enable_gradient_checkpointing is given exactly once;
# presumably the later of two conflicting overrides wins, so True is the value
# kept here - TODO confirm against the trainer's override handling.
set -- \
  "data.train_files=$train_files" \
  "data.val_files=$test_files" \
  data.train_batch_size=1024 \
  data.max_prompt_length=1024 \
  data.max_response_length=1024 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=Qwen/Qwen2.5-32B-Instruct \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  critic.optim.lr=1e-5 \
  critic.model.use_remove_padding=True \
  critic.model.path=Qwen/Qwen2.5-32B-Instruct \
  critic.model.enable_gradient_checkpointing=False \
  critic.ppo_micro_batch_size_per_gpu=8 \
  critic.model.fsdp_config.param_offload=False \
  critic.model.fsdp_config.optimizer_offload=False \
  algorithm.kl_ctrl.kl_coef=0.0001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_example' \
  trainer.experiment_name='Qwen2.5-32B-Instruct_function_rm' \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=4 \
  trainer.save_freq=-1 \
  trainer.test_freq=10 \
  trainer.total_epochs=15 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo with algorithm.adv_estimator=remax on the GSM8K
# parquet files under $HOME/data/gsm8k, using Qwen/Qwen2.5-3B-Instruct as the
# actor model path and vllm as the rollout name.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export VLLM_ATTENTION_BACKEND=XFORMERS

# Collect every override in the positional parameters. "$@" stays quoted so
# caller-supplied values containing spaces or glob characters are not re-split.
# The logger list is quoted as a whole so the shell cannot treat the brackets
# as a glob pattern; the program receives the same argument text as before.
set -- \
  algorithm.adv_estimator=remax \
  "data.train_files=$HOME/data/gsm8k/train.parquet" \
  "data.val_files=$HOME/data/gsm8k/test.parquet" \
  data.train_batch_size=512 \
  data.max_prompt_length=512 \
  data.max_response_length=1024 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=Qwen/Qwen2.5-3B-Instruct \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.actor.ppo_mini_batch_size=128 \
  actor_rollout_ref.actor.use_dynamic_bsz=True \
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=30000 \
  actor_rollout_ref.actor.use_kl_loss=True \
  actor_rollout_ref.actor.kl_loss_coef=0.001 \
  actor_rollout_ref.actor.kl_loss_type=low_var_kl \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
  actor_rollout_ref.rollout.n=4 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_remax_example_gsm8k' \
  trainer.experiment_name='qwen2.5_3b_function_rm_kl1e-3' \
  +trainer.val_before_train=False \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=5 \
  trainer.total_epochs=5 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo with algorithm.adv_estimator=remax on the GSM8K
# parquet files under $HOME/data/gsm8k, using Qwen/Qwen2.5-7B-Instruct as the
# actor model path and vllm as the rollout name.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

export HF_DATASETS_OFFLINE=1
export TRANSFORMERS_OFFLINE=1
export VLLM_ATTENTION_BACKEND=XFORMERS

# Collect every override in the positional parameters. "$@" stays quoted so
# caller-supplied values containing spaces or glob characters are not re-split.
# The logger list is quoted as a whole so the shell cannot treat the brackets
# as a glob pattern; the program receives the same argument text as before.
set -- \
  algorithm.adv_estimator=remax \
  "data.train_files=$HOME/data/gsm8k/train.parquet" \
  "data.val_files=$HOME/data/gsm8k/test.parquet" \
  data.train_batch_size=1024 \
  data.max_prompt_length=512 \
  data.max_response_length=1024 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=Qwen/Qwen2.5-7B-Instruct \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.use_dynamic_bsz=True \
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
  actor_rollout_ref.actor.use_kl_loss=True \
  actor_rollout_ref.actor.kl_loss_coef=0.001 \
  actor_rollout_ref.actor.kl_loss_type=low_var_kl \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
  actor_rollout_ref.rollout.n=4 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_remax_example_gsm8k' \
  trainer.experiment_name='qwen2.5_7b_function_rm_kl1e-3' \
  +trainer.val_before_train=False \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=5 \
  trainer.total_epochs=10 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
# Launch verl.trainer.main_ppo with algorithm.adv_estimator=rloo on the GSM8K
# parquet files under $HOME/data/gsm8k, using Qwen/Qwen2-7B-Instruct as the
# actor model path and vllm as the rollout name.
#
# Any arguments given to this script are appended after the built-in overrides
# and forwarded to the trainer unchanged.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

# Collect every override in the positional parameters. "$@" stays quoted so
# caller-supplied values containing spaces or glob characters are not re-split.
# The logger list is quoted as a whole so the shell cannot treat the brackets
# as a glob pattern; the program receives the same argument text as before.
# Only param_offload and optimizer_offload are set under actor.fsdp_config.
set -- \
  algorithm.adv_estimator=rloo \
  "data.train_files=$HOME/data/gsm8k/train.parquet" \
  "data.val_files=$HOME/data/gsm8k/test.parquet" \
  data.train_batch_size=1024 \
  data.max_prompt_length=512 \
  data.max_response_length=1024 \
  data.filter_overlong_prompts=True \
  data.truncation='error' \
  actor_rollout_ref.model.path=Qwen/Qwen2-7B-Instruct \
  actor_rollout_ref.actor.optim.lr=1e-6 \
  actor_rollout_ref.model.use_remove_padding=True \
  actor_rollout_ref.actor.ppo_mini_batch_size=256 \
  actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \
  actor_rollout_ref.actor.use_kl_loss=True \
  actor_rollout_ref.actor.kl_loss_coef=0.001 \
  actor_rollout_ref.actor.kl_loss_type=low_var_kl \
  actor_rollout_ref.model.enable_gradient_checkpointing=True \
  actor_rollout_ref.actor.fsdp_config.param_offload=False \
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
  actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \
  actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
  actor_rollout_ref.rollout.name=vllm \
  actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
  actor_rollout_ref.rollout.n=5 \
  actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \
  actor_rollout_ref.ref.fsdp_config.param_offload=True \
  algorithm.kl_ctrl.kl_coef=0.001 \
  trainer.critic_warmup=0 \
  'trainer.logger=[console,wandb]' \
  trainer.project_name='verl_rloo_example_gsm8k' \
  trainer.experiment_name='qwen2_7b_function_rm' \
  trainer.n_gpus_per_node=8 \
  trainer.nnodes=1 \
  trainer.save_freq=-1 \
  trainer.test_freq=5 \
  trainer.total_epochs=15 \
  "$@"

python3 -m verl.trainer.main_ppo "$@"
\ No newline at end of file
# Fine-tune deepseek-ai/deepseek-coder-6.7b-instruct on the GSM8K parquet files
# under $HOME/data/gsm8k with verl.trainer.fsdp_sft_trainer, started through a
# single-node torchrun.
#
# Arguments:
#   $1  nproc_per_node - value passed to torchrun --nproc_per_node
#   $2  save_path      - value passed as trainer.default_local_dir
#   $3+ other_configs  - extra key=value overrides, forwarded verbatim
set -x

if [ "$#" -lt 2 ]; then
    echo "Usage: run_deepseek_6b7.sh <nproc_per_node> <save_path> [other_configs...]"
    exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so "$@" refers to the rest
shift 2

# All expansions are quoted so that a save path or an override containing
# spaces or glob characters is passed through as a single argument.
torchrun --standalone --nnodes=1 --nproc_per_node="$nproc_per_node" \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    +data.prompt_dict_keys=['question'] \
    +data.response_dict_keys=['answer'] \
    data.micro_batch_size_per_gpu=4 \
    model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \
    trainer.default_local_dir="$save_path" \
    trainer.project_name=gsm8k-sft \
    trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \
    trainer.total_epochs=4 \
    trainer.logger=['console','wandb'] \
    trainer.default_hdfs_dir=null "$@"
\ No newline at end of file
# Tested with 2 & 4 GPUs
#
# Fine-tune google/gemma-2b-it on the GSM8K parquet files under
# $HOME/data/gsm8k with verl.trainer.fsdp_sft_trainer, started through a
# single-node torchrun.
#
# Arguments:
#   $1  nproc_per_node - value passed to torchrun --nproc_per_node
#   $2  save_path      - value passed as trainer.default_local_dir
#   $3+ other_configs  - extra key=value overrides, forwarded verbatim
set -x

if [ "$#" -lt 2 ]; then
    echo "Usage: run_gemma_2b.sh <nproc_per_node> <save_path> [other_configs...]"
    exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so "$@" refers to the rest
shift 2

# All expansions are quoted so that a save path or an override containing
# spaces or glob characters is passed through as a single argument.
torchrun --standalone --nnodes=1 --nproc_per_node="$nproc_per_node" \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    +data.prompt_dict_keys=['question'] \
    +data.response_dict_keys=['answer'] \
    data.micro_batch_size_per_gpu=4 \
    model.partial_pretrain=google/gemma-2b-it \
    trainer.default_local_dir="$save_path" \
    trainer.project_name=gsm8k-sft \
    trainer.experiment_name=gsm8k-sft-gemma-2b-it \
    trainer.total_epochs=2 \
    trainer.logger=['console','wandb'] \
    trainer.default_hdfs_dir=null "$@"
\ No newline at end of file
# Fine-tune google/gemma-1.1-7b-it on the GSM8K parquet files under
# $HOME/data/gsm8k with verl.trainer.fsdp_sft_trainer, started through a
# single-node torchrun. Prompts and responses are read from the "prompt" and
# "answer" keys (data.prompt_key / data.response_key).
#
# Arguments:
#   $1  nproc_per_node - value passed to torchrun --nproc_per_node
#   $2  save_path      - value passed as trainer.default_local_dir
#   $3+ other_configs  - extra key=value overrides, forwarded verbatim
set -x

if [ "$#" -lt 2 ]; then
    echo "Usage: run_gemma_7b.sh <nproc_per_node> <save_path> [other_configs...]"
    exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so "$@" refers to the rest
shift 2

# All expansions are quoted so that a save path or an override containing
# spaces or glob characters is passed through as a single argument.
torchrun --standalone --nnodes=1 --nproc_per_node="$nproc_per_node" \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=prompt \
    data.response_key=answer \
    data.micro_batch_size_per_gpu=4 \
    model.partial_pretrain=google/gemma-1.1-7b-it \
    trainer.default_local_dir="$save_path" \
    trainer.project_name=gsm8k-sft \
    trainer.experiment_name=gsm8k-sft-gemma-1.1-7b-it \
    trainer.total_epochs=4 \
    trainer.logger=['console','wandb'] \
    trainer.default_hdfs_dir=null "$@"
\ No newline at end of file
# Tested with 2 & 4 GPUs
#
# Fine-tune Qwen/Qwen2.5-0.5B-Instruct on the GSM8K parquet files under
# $HOME/data/gsm8k with verl.trainer.fsdp_sft_trainer, started through a
# single-node torchrun, with the LoRA settings model.lora_rank=32,
# model.lora_alpha=16 and model.target_modules=all-linear.
#
# Arguments:
#   $1  nproc_per_node - value passed to torchrun --nproc_per_node
#   $2  save_path      - value passed as trainer.default_local_dir
#   $3+ other_configs  - extra key=value overrides, forwarded verbatim
#                        (placed before the LoRA settings on the command line)
set -x

if [ "$#" -lt 2 ]; then
    echo "Usage: run_qwen_05_peft.sh <nproc_per_node> <save_path> [other_configs...]"
    exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so "$@" refers to the rest
shift 2

# All expansions are quoted so that a save path or an override containing
# spaces or glob characters is passed through as a single argument.
# Every continuation backslash is preceded by a space: without it the shell
# glues the argument to whatever starts the next line.
torchrun --standalone --nnodes=1 --nproc_per_node="$nproc_per_node" \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    optim.lr=1e-4 \
    +data.prompt_dict_keys=['question'] \
    +data.response_dict_keys=['answer'] \
    data.micro_batch_size_per_gpu=4 \
    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
    trainer.default_local_dir="$save_path" \
    trainer.project_name=gsm8k-sft \
    trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct \
    trainer.logger=['console'] \
    trainer.total_epochs=1 \
    trainer.default_hdfs_dir=null "$@" \
    model.lora_rank=32 \
    model.lora_alpha=16 \
    model.target_modules=all-linear

# Or you can do this:
# model.target_modules=[q_proj,v_proj] \
# Fine-tune Qwen/Qwen2.5-0.5B-Instruct on the GSM8K parquet files under
# $HOME/data/gsm8k with verl.trainer.fsdp_sft_trainer, started through a
# single-node torchrun, with ulysses_sequence_parallel_size=2 and
# use_remove_padding=true. Runs for trainer.total_training_steps=1.
#
# Arguments:
#   $1  nproc_per_node - value passed to torchrun --nproc_per_node
#   $2  save_path      - value passed as trainer.default_local_dir
#   $3+ other_configs  - extra key=value overrides, forwarded verbatim
#                        (placed before the two sequence-parallel settings)
set -x

if [ "$#" -lt 2 ]; then
    echo "Usage: run_qwen_05_sp2.sh <nproc_per_node> <save_path> [other_configs...]"
    exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so "$@" refers to the rest
shift 2

# All expansions are quoted so that a save path or an override containing
# spaces or glob characters is passed through as a single argument.
torchrun --standalone --nnodes=1 --nproc_per_node="$nproc_per_node" \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    optim.lr=1e-4 \
    +data.prompt_dict_keys=['question'] \
    +data.response_dict_keys=['answer'] \
    data.micro_batch_size=4 \
    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
    trainer.default_local_dir="$save_path" \
    trainer.project_name=gsm8k-sft \
    trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct-sp2 \
    trainer.logger=['console'] \
    trainer.total_training_steps=1 \
    trainer.default_hdfs_dir=null "$@" \
    ulysses_sequence_parallel_size=2 \
    use_remove_padding=true
# Fine-tune Qwen/Qwen2.5-0.5B-Instruct on the GSM8K parquet files under
# $HOME/data/gsm8k with verl.trainer.fsdp_sft_trainer, started through a
# single-node torchrun, with model.use_liger=True,
# ulysses_sequence_parallel_size=2 and use_remove_padding=true.
#
# Arguments:
#   $1  nproc_per_node - value passed to torchrun --nproc_per_node
#   $2  save_path      - value passed as trainer.default_local_dir
#   $3+ other_configs  - extra key=value overrides, forwarded verbatim
#                        (placed before the two sequence-parallel settings)
set -x

# NOTE(review): the usage text names run_qwen_05_sp2.sh although the experiment
# name below ends in "-sp2-liger"; confirm the intended script name.
if [ "$#" -lt 2 ]; then
    echo "Usage: run_qwen_05_sp2.sh <nproc_per_node> <save_path> [other_configs...]"
    exit 1
fi

nproc_per_node=$1
save_path=$2

# Shift the arguments so "$@" refers to the rest
shift 2

# All expansions are quoted so that a save path or an override containing
# spaces or glob characters is passed through as a single argument.
torchrun --standalone --nnodes=1 --nproc_per_node="$nproc_per_node" \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    optim.lr=1e-4 \
    +data.prompt_dict_keys=['question'] \
    +data.response_dict_keys=['answer'] \
    data.micro_batch_size=4 \
    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
    model.use_liger=True \
    trainer.default_local_dir="$save_path" \
    trainer.project_name=gsm8k-sft \
    trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct-sp2-liger \
    trainer.logger=['console'] \
    trainer.default_hdfs_dir=null "$@" \
    ulysses_sequence_parallel_size=2 \
    use_remove_padding=true
#!/bin/bash
#SBATCH --job-name=verl-ray-on-slurm
#SBATCH --nodes=2
#SBATCH --ntasks-per-node=1
#SBATCH --mem=200G
#SBATCH --partition=your-partition
#SBATCH --time=01:00:00
#SBATCH --account=your-account
#SBATCH --gpus-per-node=4
#SBATCH --cpus-per-task=64
#SBATCH --output=slurm-%j.out
#SBATCH --error=slurm-%j.err

# Slurm batch job that starts `ray start` on every allocated node (head first,
# then one worker per remaining node), each inside an apptainer container, and
# then runs verl.trainer.main_ppo on the head node. Combined stdout/stderr of
# the training step is also written to verl_demo_slurm.log.

# load necessary modules

# replace these information with your own
verl_workdir=/path/to/verl
train_files=/path/to/gsm8k/train.parquet
val_files=/path/to/gsm8k/test.parquet
apptainer_image_path=/path/to/verl-ngc.sif
# replace these information with your own

# Getting the node names
nodes=$(scontrol show hostnames "$SLURM_JOB_NODELIST")
# Split the hostname list into one array element per line. A quoted
# ("$nodes") array literal would produce a single element holding every
# hostname, leaving nodes_array[1..] empty for the worker loop below.
mapfile -t nodes_array <<<"$nodes"

head_node=${nodes_array[0]}
head_node_ip=$(srun --nodes=1 --ntasks=1 -w "$head_node" hostname --ip-address)

# if we detect a space character in the head node IP, we'll
# convert it to an ipv4 address. This step is optional.
if [[ "$head_node_ip" == *" "* ]]; then
    IFS=' ' read -ra ADDR <<<"$head_node_ip"
    # Keep the shorter of the first two addresses: an entry longer than
    # 16 characters is taken to be the IPv6 one.
    if [[ ${#ADDR[0]} -gt 16 ]]; then
        head_node_ip=${ADDR[1]}
    else
        head_node_ip=${ADDR[0]}
    fi
    echo "IPV6 address detected. We split the IPV4 address as $head_node_ip"
fi

port=6379
ip_head=$head_node_ip:$port
export ip_head
echo "IP Head: $ip_head"

# make sure we set environment variables before Ray initialization
export VLLM_ATTENTION_BACKEND=XFORMERS

printenv

echo "Starting HEAD at $head_node"
srun --nodes=1 --ntasks=1 -w "$head_node" \
    apptainer run --nv --bind "$verl_workdir" "$apptainer_image_path" \
    ray start --head --node-ip-address="$head_node_ip" --port="$port" \
    --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
# optional, though may be useful in certain versions of Ray < 1.0.
sleep 10

# number of nodes other than the head node
worker_num=$((SLURM_JOB_NUM_NODES - 1))

for ((i = 1; i <= worker_num; i++)); do
    node_i=${nodes_array[$i]}
    echo "Starting WORKER $i at $node_i"
    srun --nodes=1 --ntasks=1 -w "$node_i" \
        apptainer run --nv --bind "$verl_workdir" "$apptainer_image_path" \
        ray start --address "$ip_head" --num-cpus "${SLURM_CPUS_PER_TASK}" --num-gpus "${SLURM_GPUS_PER_NODE}" --block &
    sleep 5
done

# Training step: runs in the foreground on the head node while the
# `ray start --block` steps above keep running in the background.
PYTHONUNBUFFERED=1 srun --overlap --nodes=1 --ntasks=1 -w "$head_node" \
    apptainer run --nv --bind "$verl_workdir" "$apptainer_image_path" \
    python3 -m verl.trainer.main_ppo \
    data.train_files="$train_files" \
    data.val_files="$val_files" \
    data.train_batch_size=256 \
    data.max_prompt_length=512 \
    data.max_response_length=256 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    critic.optim.lr=1e-5 \
    critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
    critic.ppo_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.logger=['console'] \
    +trainer.val_before_train=False \
    trainer.default_hdfs_dir=null \
    trainer.n_gpus_per_node="${SLURM_GPUS_PER_NODE}" \
    trainer.nnodes="${SLURM_NNODES}" \
    trainer.save_freq=10 \
    trainer.test_freq=10 \
    trainer.total_epochs=15 2>&1 | tee verl_demo_slurm.log
# Run main_ppo_split.py (resolved relative to the current directory) on the
# GSM8K parquet files under $HOME/data/gsm8k, using
# deepseek-ai/deepseek-llm-7b-chat for both the actor and the critic and
# requesting 1 node x 8 GPUs (trainer.nnodes / trainer.n_gpus_per_node).
#
# Usage: bash <this script> [extra key=value overrides...]
# Extra arguments are forwarded verbatim after the defaults listed below.
set -x

# "$@" is quoted so that each extra override reaches the trainer as exactly one
# argument, even if it contains spaces or glob characters.
python3 main_ppo_split.py \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    critic.optim.lr=1e-5 \
    critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=8 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger=['console','wandb'] \
    trainer.project_name='verl_example_gsm8k' \
    trainer.experiment_name='deepseek_llm_7b_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_epochs=15 "$@"
#!/bin/bash
# Slurm batch job that logs host/GPU information, loads the cluster modules,
# activates the "dapo" conda environment and runs scripts/model_merger.py on
# the actor checkpoints of global steps 20 and 40.

#- Job parameters

# (TODO)
# Please modify job name

#SBATCH -J to-model # The job name
#SBATCH -o ret_one/%j.out # Write the standard output to file named 'ret_one/<job_number>.out'
#SBATCH -e ret_one/%j.err # Write the standard error to file named 'ret_one/<job_number>.err'

#- Resources

# (TODO)
# Please modify your requirements

#SBATCH -p r8nv-gpu-hw # Submit to 'r8nv-gpu-hw' partition
###SBATCH -p r8nv-gpu-dedicated # Submit to 'r8nv-gpu-dedicated' partition
#SBATCH -t 1-05:59:59 # Run for a maximum time of 1 day, 05 hours, 59 mins, 59 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:8 # Request M GPU per node
###SBATCH --qos=normal # Request QOS Type
#SBATCH --qos=gpu-normal # Request QOS Type
###SBATCH --constraint=L40|L40S
###SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity

###
### The system will alloc 8 or 16 cores per gpu by default.
### If you need more or less, use following:
### #SBATCH --cpus-per-task=K # Request K cores
###
###
### Without specifying the constraint, any available nodes that meet the requirement will be allocated
### You can specify the characteristics of the compute nodes, and even the names of the compute nodes
###
### #SBATCH --nodelist=r8a30-a0 # Request a specific list of hosts
### #SBATCH --constraint="A100|A30" # Request GPU Type: A30 or A100_40GB
###

#- Log information

echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
echo "$(df -h | grep -v tmpfs)"

#- Important setting!!!
## otherwise it will cause an error of insufficient RDMA resources:
ulimit -l unlimited
## otherwise it will result in an insufficient virtual memory size error, especially when loading LLM:
ulimit -v unlimited

#- Load environments
source ~/.bashrc
source /tools/module_env.sh
module list # list modules loaded

##- Tools
module load cluster-tools/v1.0
module load slurm-tools/v1.0

##- language
# module load python3/3.8.16

##- CUDA
module unload cuda-cudnn
module load cuda-cudnn/11.8-8.8.1

##- virtualenv
# source xxxxx/activate

echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)

#- Other
cluster-quota # nas quota
nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info

#- WARNING! DO NOT MODIFY your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Using GPU(s) ${CUDA_VISIBLE_DEVICES}" # which GPUs
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
echo "This job is assigned the following resources by SLURM:"
# Job id is quoted so it is always passed to scontrol as a single argument.
scontrol show jobid "$SLURM_JOB_ID" -dd | awk '/IDX/ {print $2, $4}'

#- Main program execution
# conda activate plansearch
conda activate dapo

# Run the merger once per saved step; CKPT_PATH is relative to the directory
# the job runs in.
for step in 20 40
do
    CKPT_PATH="results/run_dapo_codev_7b_16k_continuous_reward_0.0/global_step_$step/actor"
    # CKPT_PATH="/nfs_global/S/zhangxiaoyun/dapo/verl/results/run_codev_7b_3.4k_dapo_2_node/global_step_$step/actor"
    python scripts/model_merger.py --local_dir "$CKPT_PATH"
done

#- End
# slurm-gpu-atop-log-stats $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
# This will overwrite any existing atop logs from previous runs.
# WARNING: If your program times out or is terminated by scancel,
# the above script part might not execute correctly.
#!/bin/bash
# Launch verl.trainer.main_ppo with algorithm.adv_estimator=grpo on the codev
# v1/1.6k parquet files, using the reward function compute_score_wrapper from
# verl/utils/reward_score/codev.py.
#
# Required environment (the script aborts under `set -u` if any is unset):
#   WANDB_DIR, SAVE_DIR, WANDB_API_KEY, USER_GPUS_PER_NODE, SLURM_JOB_NUM_NODES
#
# Usage: bash <this script> [extra key=value overrides...]
# Extra arguments are forwarded verbatim after the defaults listed below.
set -euxo pipefail

# NOTE(review): of the settings assigned below, only clip_ratio_low,
# clip_ratio_high, enable_overlong_buffer, overlong_buffer_len,
# overlong_penalty_factor, use_token_level_loss and MODEL_PATH are referenced
# by the launch command; the others are assigned but not used in this script.
project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'

adv_estimator=grpo

kl_coef=0.0
kl_loss_coef=0.0

clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 1))
overlong_penalty_factor=1.0

# An early version for DAPO
enable_filter_groups=False
gen_prompt_bsz=512 # NOTE: no filtering here
train_prompt_bsz=512
train_prompt_mini_bsz=32
n_resp_per_prompt=16

use_token_level_loss=False

# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}
# Paths

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4

export VLLM_USE_V1=1

echo "$WANDB_DIR"
echo "$SAVE_DIR"
# WANDB_API_KEY must still be set (same abort-on-unset as before), but its
# value is never printed: tracing is paused so the secret stays out of the log.
{ set +x; } 2>/dev/null
: "${WANDB_API_KEY?WANDB_API_KEY must be set}"
echo "WANDB_API_KEY is set (value hidden)"
set -x

# Model path (assigned unconditionally; edit here to use another checkpoint)
MODEL_PATH="/share/collab/codemodel/models/Qwen2.5-Coder-7B-Instruct"

# Original note: "Train over a single node, 8 A100-80GB GPUs." The node and GPU
# counts actually passed are SLURM_JOB_NUM_NODES and USER_GPUS_PER_NODE.
# Every continuation backslash is preceded by a space: without it the shell
# glues the argument to whatever starts the next line.
python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/1.6k/train.parquet \
    data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/1.6k/test.parquet \
    data.train_batch_size=128 \
    data.val_batch_size=512 \
    data.max_prompt_length=1024 \
    data.max_response_length=3072 \
    actor_rollout_ref.model.path="$MODEL_PATH" \
    +actor_rollout_ref.model.override_config.attention_dropout=0. \
    +actor_rollout_ref.model.override_config.embd_pdrop=0. \
    +actor_rollout_ref.model.override_config.resid_pdrop=0. \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.optim.weight_decay=0.0 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=12000 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.clip_ratio_low="${clip_ratio_low}" \
    actor_rollout_ref.actor.clip_ratio_high="${clip_ratio_high}" \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.00 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.grad_clip=0.5 \
    actor_rollout_ref.actor.use_token_level_loss="${use_token_level_loss}" \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=24000 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.n=8 \
    actor_rollout_ref.rollout.val_kwargs.n=2 \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
    actor_rollout_ref.rollout.enforce_eager=False \
    actor_rollout_ref.rollout.free_cache_engine=False \
    reward_model.reward_manager=prime \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    custom_reward_function.overlong_buffer.enable="${enable_overlong_buffer}" \
    custom_reward_function.overlong_buffer.len="${overlong_buffer_len}" \
    custom_reward_function.overlong_buffer.penalty_factor="${overlong_penalty_factor}" \
    custom_reward_function.path=verl/utils/reward_score/codev.py \
    custom_reward_function.name=compute_score_wrapper \
    custom_reward_function.continuous_reward.enable=True \
    custom_reward_function.continuous_reward.error_ratio_threshold=0.5 \
    algorithm.kl_ctrl.kl_coef=0.0 \
    trainer.critic_warmup=0 \
    trainer.logger=['console','wandb'] \
    trainer.project_name='codev' \
    trainer.experiment_name='codev-7b-16k' \
    trainer.n_gpus_per_node="$USER_GPUS_PER_NODE" \
    trainer.nnodes="$SLURM_JOB_NUM_NODES" \
    +trainer.val_before_train=False \
    trainer.default_local_dir="$SAVE_DIR" \
    trainer.resume_mode=auto \
    trainer.default_hdfs_dir=null \
    trainer.save_freq=20 \
    trainer.test_freq=20 \
    trainer.total_epochs=100 "$@"
# custom_reward_function.path=/nfs_global/S/zhuyaoyu/projects/dapo/verl/utils/reward_score/codev.py \
\ No newline at end of file
#!/usr/bin/env bash
# Download the DAPO-Math-17k training parquet and the AIME-2024 test parquet
# from huggingface.co.
#
# Environment (all optional):
#   VERL_HOME   base directory, defaults to $HOME/verl
#   TRAIN_FILE  destination of the training file,
#               defaults to $VERL_HOME/data/dapo-math-17k.parquet
#   TEST_FILE   destination of the test file,
#               defaults to $VERL_HOME/data/aime-2024.parquet
#
# Exit status: 0 if every step succeeded, otherwise the status of the last
# step that failed.
set -uxo pipefail

export VERL_HOME=${VERL_HOME:-"${HOME}/verl"}
export TRAIN_FILE=${TRAIN_FILE:-"${VERL_HOME}/data/dapo-math-17k.parquet"}
export TEST_FILE=${TEST_FILE:-"${VERL_HOME}/data/aime-2024.parquet"}

# Every step is still attempted even if an earlier one fails, but a failure is
# remembered so it is not hidden behind a later successful download.
status=0

mkdir -p "${VERL_HOME}/data" || status=$?

wget -O "${TRAIN_FILE}" "https://huggingface.co/datasets/BytedTsinghua-SIA/DAPO-Math-17k/resolve/main/data/dapo-math-17k.parquet?download=true" || status=$?

wget -O "${TEST_FILE}" "https://huggingface.co/datasets/BytedTsinghua-SIA/AIME-2024/resolve/main/data/aime-2024.parquet?download=true" || status=$?

exit "${status}"
\ No newline at end of file
#!/bin/bash
# Launch verl.trainer.main_ppo with algorithm.adv_estimator=grpo, training on
# the codev v1/16k_r1_filtered parquet file and validating on the
# v1_1/10k_qwq test parquet file.
#
# Required environment (the script aborts under `set -u` if any is unset):
#   WANDB_DIR, SAVE_DIR, WANDB_API_KEY, USER_GPUS_PER_NODE, SLURM_JOB_NUM_NODES
#
# Usage: bash <this script> [extra key=value overrides...]
# Extra arguments are forwarded verbatim after the defaults listed below.
set -euxo pipefail

# NOTE(review): of the settings assigned below, only clip_ratio_low,
# clip_ratio_high, enable_overlong_buffer, overlong_buffer_len,
# overlong_penalty_factor, use_token_level_loss and MODEL_PATH are referenced
# by the launch command; the others are assigned but not used in this script.
project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'

adv_estimator=grpo

kl_coef=0.0
kl_loss_coef=0.0

clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

# An early version for DAPO
enable_filter_groups=False
gen_prompt_bsz=512 # NOTE: no filtering here
train_prompt_bsz=512
train_prompt_mini_bsz=32
n_resp_per_prompt=16

use_token_level_loss=False

# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}
# Paths

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4

export VLLM_USE_V1=1

echo "$WANDB_DIR"
echo "$SAVE_DIR"
# WANDB_API_KEY must still be set (same abort-on-unset as before), but its
# value is never printed: tracing is paused so the secret stays out of the log.
{ set +x; } 2>/dev/null
: "${WANDB_API_KEY?WANDB_API_KEY must be set}"
echo "WANDB_API_KEY is set (value hidden)"
set -x

# Model path (assigned unconditionally; edit here to use another checkpoint)
MODEL_PATH="/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"

# Original note: "Train over a single node, 8 A100-80GB GPUs." The node and GPU
# counts actually passed are SLURM_JOB_NUM_NODES and USER_GPUS_PER_NODE.
# Every continuation backslash is preceded by a space: without it the shell
# glues the argument to whatever starts the next line.
python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/16k_r1_filtered/train.parquet \
    data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1_1/10k_qwq/test.parquet \
    data.train_batch_size=128 \
    data.val_batch_size=512 \
    data.max_prompt_length=2048 \
    data.max_response_length=16384 \
    actor_rollout_ref.model.path="$MODEL_PATH" \
    +actor_rollout_ref.model.override_config.attention_dropout=0. \
    +actor_rollout_ref.model.override_config.embd_pdrop=0. \
    +actor_rollout_ref.model.override_config.resid_pdrop=0. \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.optim.weight_decay=0.0 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.clip_ratio_low="${clip_ratio_low}" \
    actor_rollout_ref.actor.clip_ratio_high="${clip_ratio_high}" \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.00 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.grad_clip=0.5 \
    actor_rollout_ref.actor.use_token_level_loss="${use_token_level_loss}" \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=32768 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.n=16 \
    actor_rollout_ref.rollout.val_kwargs.n=1 \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
    actor_rollout_ref.rollout.enforce_eager=False \
    actor_rollout_ref.rollout.free_cache_engine=False \
    reward_model.reward_manager=prime \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    custom_reward_function.overlong_buffer.enable="${enable_overlong_buffer}" \
    custom_reward_function.overlong_buffer.len="${overlong_buffer_len}" \
    custom_reward_function.overlong_buffer.penalty_factor="${overlong_penalty_factor}" \
    algorithm.kl_ctrl.kl_coef=0.0 \
    trainer.critic_warmup=0 \
    trainer.logger=['console','wandb'] \
    trainer.project_name='codev' \
    trainer.experiment_name='codev-7b-16k' \
    trainer.n_gpus_per_node="$USER_GPUS_PER_NODE" \
    trainer.nnodes="$SLURM_JOB_NUM_NODES" \
    +trainer.val_before_train=False \
    trainer.default_local_dir="$SAVE_DIR" \
    trainer.resume_mode=auto \
    trainer.default_hdfs_dir=null \
    trainer.save_freq=15 \
    trainer.test_freq=200 \
    trainer.total_epochs=100 "$@"
\ No newline at end of file
#!/bin/bash
# Launch verl.trainer.main_ppo with algorithm.adv_estimator=grpo on the codev
# v1/16k_r1_filtered parquet files, using the reward function
# compute_score_wrapper from verl/utils/reward_score/codev.py with
# continuous_reward enabled (error_ratio_threshold=0.2).
#
# Required environment (the script aborts under `set -u` if any is unset):
#   WANDB_DIR, SAVE_DIR, WANDB_API_KEY, USER_GPUS_PER_NODE, SLURM_JOB_NUM_NODES
#
# Usage: bash <this script> [extra key=value overrides...]
# Extra arguments are forwarded verbatim after the defaults listed below.
set -euxo pipefail

# NOTE(review): of the settings assigned below, only clip_ratio_low,
# clip_ratio_high, enable_overlong_buffer, overlong_buffer_len,
# overlong_penalty_factor, use_token_level_loss and MODEL_PATH are referenced
# by the launch command; the others are assigned but not used in this script.
project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'

adv_estimator=grpo

kl_coef=0.0
kl_loss_coef=0.0

clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

# An early version for DAPO
enable_filter_groups=False
gen_prompt_bsz=512 # NOTE: no filtering here
train_prompt_bsz=512
train_prompt_mini_bsz=32
n_resp_per_prompt=16

use_token_level_loss=False

# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}
# Paths

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4

export VLLM_USE_V1=1

echo "$WANDB_DIR"
echo "$SAVE_DIR"
# WANDB_API_KEY must still be set (same abort-on-unset as before), but its
# value is never printed: tracing is paused so the secret stays out of the log.
{ set +x; } 2>/dev/null
: "${WANDB_API_KEY?WANDB_API_KEY must be set}"
echo "WANDB_API_KEY is set (value hidden)"
set -x

# Model path (assigned unconditionally; edit here to use another checkpoint)
MODEL_PATH="/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"

# Original note: "Train over a single node, 8 A100-80GB GPUs." The node and GPU
# counts actually passed are SLURM_JOB_NUM_NODES and USER_GPUS_PER_NODE.
# Every continuation backslash is preceded by a space: without it the shell
# glues the argument to whatever starts the next line.
python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/16k_r1_filtered/train.parquet \
    data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/16k_r1_filtered/test.parquet \
    data.train_batch_size=128 \
    data.val_batch_size=512 \
    data.max_prompt_length=2048 \
    data.max_response_length=16384 \
    actor_rollout_ref.model.path="$MODEL_PATH" \
    +actor_rollout_ref.model.override_config.attention_dropout=0. \
    +actor_rollout_ref.model.override_config.embd_pdrop=0. \
    +actor_rollout_ref.model.override_config.resid_pdrop=0. \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.optim.weight_decay=0.0 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.clip_ratio_low="${clip_ratio_low}" \
    actor_rollout_ref.actor.clip_ratio_high="${clip_ratio_high}" \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.00 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.grad_clip=0.5 \
    actor_rollout_ref.actor.use_token_level_loss="${use_token_level_loss}" \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=32768 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.n=16 \
    actor_rollout_ref.rollout.val_kwargs.n=2 \
    actor_rollout_ref.rollout.temperature=1.0 \
    actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
    actor_rollout_ref.rollout.val_kwargs.do_sample=True \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
    actor_rollout_ref.rollout.enforce_eager=False \
    actor_rollout_ref.rollout.free_cache_engine=False \
    reward_model.reward_manager=prime \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    custom_reward_function.overlong_buffer.enable="${enable_overlong_buffer}" \
    custom_reward_function.overlong_buffer.len="${overlong_buffer_len}" \
    custom_reward_function.overlong_buffer.penalty_factor="${overlong_penalty_factor}" \
    custom_reward_function.path=verl/utils/reward_score/codev.py \
    custom_reward_function.name=compute_score_wrapper \
    custom_reward_function.continuous_reward.enable=True \
    custom_reward_function.continuous_reward.error_ratio_threshold=0.2 \
    algorithm.kl_ctrl.kl_coef=0.0 \
    trainer.critic_warmup=0 \
    trainer.logger=['console','wandb'] \
    trainer.project_name='codev' \
    trainer.experiment_name='codev-7b-16k' \
    trainer.n_gpus_per_node="$USER_GPUS_PER_NODE" \
    trainer.nnodes="$SLURM_JOB_NUM_NODES" \
    +trainer.val_before_train=False \
    trainer.default_local_dir="$SAVE_DIR" \
    trainer.resume_mode=auto \
    trainer.default_hdfs_dir=null \
    trainer.save_freq=20 \
    trainer.test_freq=20 \
    trainer.total_epochs=100 "$@"
\ No newline at end of file
#!/bin/bash
# Settings for a codev GRPO run started with verl's PPO trainer.
# Environment read here: WANDB_DIR, SAVE_DIR, WANDB_API_KEY (reported only)
# and MODEL_PATH (optional override of the default checkpoint).
set -euxo pipefail # already enables -x, so a separate `set -x` is not needed

project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'

adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

# An early version for DAPO
enable_filter_groups=False
gen_prompt_bsz=512 # NOTE: no filtering here
train_prompt_bsz=512
train_prompt_mini_bsz=32
n_resp_per_prompt=16
use_token_level_loss=False

# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4

export VLLM_USE_V1=1

# Report the run environment. Unset values are shown as <unset> rather than
# aborting under `set -u`, and the W&B key itself is never written to the log
# (neither by echo nor by the xtrace of these lines).
echo "WANDB_DIR=${WANDB_DIR:-<unset>}"
echo "SAVE_DIR=${SAVE_DIR:-<unset>}"
wandb_key_state=${WANDB_API_KEY:+set}
echo "WANDB_API_KEY is ${wandb_key_state:-unset}"

# Use the default checkpoint only when MODEL_PATH was not provided.
MODEL_PATH=${MODEL_PATH:-"/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"}
# Launch GRPO training on the codev 16k_r1_filtered data through verl's PPO
# entry point. GPU and node counts come from USER_GPUS_PER_NODE and
# SLURM_JOB_NUM_NODES, checkpoints go to SAVE_DIR, and any script arguments
# are appended after the overrides below.
#
# The overrides are kept in an array so each entry is exactly one argument.
# With backslash continuations, a missing space before the backslash (as on
# the former `use_dynamic_bsz=True\` line) can fuse two overrides into one.
overrides=(
  # Algorithm and data
  algorithm.adv_estimator=grpo
  data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/16k_r1_filtered/train.parquet
  data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/16k_r1_filtered/test.parquet
  data.train_batch_size=128
  data.val_batch_size=512
  data.max_prompt_length=2048
  data.max_response_length=16384

  # Model
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  +actor_rollout_ref.model.override_config.attention_dropout=0.
  +actor_rollout_ref.model.override_config.embd_pdrop=0.
  +actor_rollout_ref.model.override_config.resid_pdrop=0.
  # Previously passed twice with the same value; once is enough.
  actor_rollout_ref.model.enable_gradient_checkpointing=True

  # Actor
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.optim.weight_decay=0.0
  actor_rollout_ref.actor.use_dynamic_bsz=True
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768
  actor_rollout_ref.model.use_remove_padding=True
  "actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}"
  "actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}"
  actor_rollout_ref.actor.ppo_mini_batch_size=64
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.00
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  actor_rollout_ref.actor.entropy_coeff=0
  actor_rollout_ref.actor.grad_clip=0.5
  "actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss}"
  actor_rollout_ref.actor.fsdp_config.param_offload=False
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False

  # Rollout
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=32768
  actor_rollout_ref.rollout.tensor_model_parallel_size=4
  actor_rollout_ref.rollout.name=vllm
  actor_rollout_ref.rollout.n=16
  actor_rollout_ref.rollout.val_kwargs.n=2
  actor_rollout_ref.rollout.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.do_sample=True
  actor_rollout_ref.rollout.gpu_memory_utilization=0.7
  actor_rollout_ref.rollout.enforce_eager=False
  actor_rollout_ref.rollout.free_cache_engine=False

  # Reward
  reward_model.reward_manager=prime
  actor_rollout_ref.ref.fsdp_config.param_offload=True
  "custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer}"
  "custom_reward_function.overlong_buffer.len=${overlong_buffer_len}"
  "custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor}"
  custom_reward_function.path=verl/utils/reward_score/codev.py
  custom_reward_function.name=compute_score_wrapper
  custom_reward_function.continuous_reward.enable=True
  custom_reward_function.continuous_reward.error_ratio_threshold=1.0
  algorithm.kl_ctrl.kl_coef=0.0

  # Trainer
  trainer.critic_warmup=0
  # Quoted as a whole so the brackets are never treated as a glob pattern;
  # the argument passed on is unchanged.
  'trainer.logger=[console,wandb]'
  trainer.project_name=codev
  trainer.experiment_name=codev-7b-16k
  "trainer.n_gpus_per_node=${USER_GPUS_PER_NODE}"
  "trainer.nnodes=${SLURM_JOB_NUM_NODES}"
  +trainer.val_before_train=False
  "trainer.default_local_dir=${SAVE_DIR}"
  trainer.resume_mode=auto
  trainer.default_hdfs_dir=null
  trainer.save_freq=20
  trainer.test_freq=20
  trainer.total_epochs=100
)
python3 -m verl.trainer.main_ppo "${overrides[@]}" "$@"
\ No newline at end of file
#!/bin/bash
# Settings for a codev GRPO run (continuous-reward data) started with verl's
# PPO trainer.
# Environment read here: WANDB_DIR, SAVE_DIR, WANDB_API_KEY (reported only)
# and MODEL_PATH (optional override of the default checkpoint).
set -euxo pipefail # already enables -x, so a separate `set -x` is not needed

project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'

adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

# An early version for DAPO
enable_filter_groups=False
gen_prompt_bsz=512 # NOTE: no filtering here
train_prompt_bsz=512
train_prompt_mini_bsz=32
n_resp_per_prompt=16
use_token_level_loss=False

# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4

export VLLM_USE_V1=1

# Report the run environment. Unset values are shown as <unset> rather than
# aborting under `set -u`, and the W&B key itself is never written to the log
# (neither by echo nor by the xtrace of these lines).
echo "WANDB_DIR=${WANDB_DIR:-<unset>}"
echo "SAVE_DIR=${SAVE_DIR:-<unset>}"
wandb_key_state=${WANDB_API_KEY:+set}
echo "WANDB_API_KEY is ${wandb_key_state:-unset}"

# Use the default checkpoint only when MODEL_PATH was not provided.
MODEL_PATH=${MODEL_PATH:-"/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"}
# Launch GRPO training on the codev continuous_reward_20k_0.8_r1 data through
# verl's PPO entry point. GPU and node counts come from USER_GPUS_PER_NODE and
# SLURM_JOB_NUM_NODES, checkpoints go to SAVE_DIR, and any script arguments
# are appended after the overrides below.
#
# The overrides are kept in an array so each entry is exactly one argument.
# With backslash continuations, a missing space before the backslash (as on
# the former `use_dynamic_bsz=True\` line) can fuse two overrides into one.
overrides=(
  # Algorithm and data
  algorithm.adv_estimator=grpo
  data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/continuous_reward_20k_0.8_r1/train.parquet
  data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/continuous_reward_20k_0.8_r1/test.parquet
  data.train_batch_size=128
  data.val_batch_size=512
  data.max_prompt_length=2048
  data.max_response_length=16384

  # Model
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  +actor_rollout_ref.model.override_config.attention_dropout=0.
  +actor_rollout_ref.model.override_config.embd_pdrop=0.
  +actor_rollout_ref.model.override_config.resid_pdrop=0.
  # Previously passed twice with the same value; once is enough.
  actor_rollout_ref.model.enable_gradient_checkpointing=True

  # Actor
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.optim.weight_decay=0.0
  actor_rollout_ref.actor.use_dynamic_bsz=True
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768
  actor_rollout_ref.model.use_remove_padding=True
  "actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}"
  "actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}"
  actor_rollout_ref.actor.ppo_mini_batch_size=64
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.00
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  actor_rollout_ref.actor.entropy_coeff=0
  actor_rollout_ref.actor.grad_clip=0.5
  "actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss}"
  actor_rollout_ref.actor.fsdp_config.param_offload=False
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False

  # Rollout
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=32768
  actor_rollout_ref.rollout.tensor_model_parallel_size=4
  actor_rollout_ref.rollout.name=vllm
  actor_rollout_ref.rollout.n=16
  actor_rollout_ref.rollout.val_kwargs.n=2
  actor_rollout_ref.rollout.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.do_sample=True
  actor_rollout_ref.rollout.gpu_memory_utilization=0.7
  actor_rollout_ref.rollout.enforce_eager=False
  actor_rollout_ref.rollout.free_cache_engine=False

  # Reward
  reward_model.reward_manager=prime
  actor_rollout_ref.ref.fsdp_config.param_offload=True
  "custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer}"
  "custom_reward_function.overlong_buffer.len=${overlong_buffer_len}"
  "custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor}"
  custom_reward_function.path=verl/utils/reward_score/codev.py
  custom_reward_function.name=compute_score_wrapper
  custom_reward_function.continuous_reward.enable=True
  custom_reward_function.continuous_reward.error_ratio_threshold=0.2
  algorithm.kl_ctrl.kl_coef=0.0

  # Trainer
  trainer.critic_warmup=0
  # Quoted as a whole so the brackets are never treated as a glob pattern;
  # the argument passed on is unchanged.
  'trainer.logger=[console,wandb]'
  trainer.project_name=codev
  trainer.experiment_name=codev-7b-16k
  "trainer.n_gpus_per_node=${USER_GPUS_PER_NODE}"
  "trainer.nnodes=${SLURM_JOB_NUM_NODES}"
  +trainer.val_before_train=False
  "trainer.default_local_dir=${SAVE_DIR}"
  trainer.resume_mode=auto
  trainer.default_hdfs_dir=null
  trainer.save_freq=20
  trainer.test_freq=20
  trainer.total_epochs=100
)
python3 -m verl.trainer.main_ppo "${overrides[@]}" "$@"
\ No newline at end of file
#!/bin/bash
# Settings for the codev-7b-3k GRPO run started with verl's PPO trainer.
# Environment read here: WANDB_DIR, SAVE_DIR, WANDB_API_KEY (reported only)
# and MODEL_PATH (optional override of the default checkpoint).
set -euxo pipefail # already enables -x, so a separate `set -x` is not needed

project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'

adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

# An early version for DAPO
enable_filter_groups=False
gen_prompt_bsz=512 # NOTE: no filtering here
train_prompt_bsz=512
train_prompt_mini_bsz=32
n_resp_per_prompt=16
use_token_level_loss=False

# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4

export VLLM_USE_V1=1

# Report the run environment. Unset values are shown as <unset> rather than
# aborting under `set -u`, and the W&B key itself is never written to the log
# (neither by echo nor by the xtrace of these lines).
echo "WANDB_DIR=${WANDB_DIR:-<unset>}"
echo "SAVE_DIR=${SAVE_DIR:-<unset>}"
wandb_key_state=${WANDB_API_KEY:+set}
echo "WANDB_API_KEY is ${wandb_key_state:-unset}"

# Use the default checkpoint only when MODEL_PATH was not provided.
MODEL_PATH=${MODEL_PATH:-"/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"}
# Launch the codev-7b-3k GRPO run (4.8k_r1_filtered training data, 10k_qwq
# validation data) through verl's PPO entry point. GPU and node counts come
# from USER_GPUS_PER_NODE and SLURM_JOB_NUM_NODES, checkpoints go to SAVE_DIR,
# and any script arguments are appended after the overrides below.
#
# The overrides are kept in an array so each entry is exactly one argument.
# With backslash continuations, a missing space before the backslash (as on
# the former `use_dynamic_bsz=True\` line) can fuse two overrides into one.
overrides=(
  # Algorithm and data
  algorithm.adv_estimator=grpo
  data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/4.8k_r1_filtered/train.parquet
  data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1_1/10k_qwq/test.parquet
  data.train_batch_size=128
  data.val_batch_size=512
  data.max_prompt_length=2048
  data.max_response_length=16384

  # Model
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  +actor_rollout_ref.model.override_config.attention_dropout=0.
  +actor_rollout_ref.model.override_config.embd_pdrop=0.
  +actor_rollout_ref.model.override_config.resid_pdrop=0.
  # Previously passed twice with the same value; once is enough.
  actor_rollout_ref.model.enable_gradient_checkpointing=True

  # Actor
  actor_rollout_ref.actor.optim.lr=2e-6
  actor_rollout_ref.actor.optim.weight_decay=0.0
  actor_rollout_ref.actor.use_dynamic_bsz=True
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768
  actor_rollout_ref.model.use_remove_padding=True
  "actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}"
  "actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}"
  actor_rollout_ref.actor.ppo_mini_batch_size=64
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.00
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  actor_rollout_ref.actor.entropy_coeff=0
  actor_rollout_ref.actor.grad_clip=0.5
  "actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss}"
  actor_rollout_ref.actor.fsdp_config.param_offload=False
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False

  # Rollout
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=32768
  actor_rollout_ref.rollout.tensor_model_parallel_size=4
  actor_rollout_ref.rollout.name=vllm
  actor_rollout_ref.rollout.n=16
  actor_rollout_ref.rollout.val_kwargs.n=2
  actor_rollout_ref.rollout.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.do_sample=True
  actor_rollout_ref.rollout.gpu_memory_utilization=0.7
  actor_rollout_ref.rollout.enforce_eager=False
  actor_rollout_ref.rollout.free_cache_engine=False

  # Reward
  reward_model.reward_manager=prime
  actor_rollout_ref.ref.fsdp_config.param_offload=True
  "custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer}"
  "custom_reward_function.overlong_buffer.len=${overlong_buffer_len}"
  "custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor}"
  algorithm.kl_ctrl.kl_coef=0.0

  # Trainer
  trainer.critic_warmup=0
  # Quoted as a whole so the brackets are never treated as a glob pattern;
  # the argument passed on is unchanged.
  'trainer.logger=[console,wandb]'
  trainer.project_name=codev
  trainer.experiment_name=codev-7b-3k
  "trainer.n_gpus_per_node=${USER_GPUS_PER_NODE}"
  "trainer.nnodes=${SLURM_JOB_NUM_NODES}"
  +trainer.val_before_train=True
  "trainer.default_local_dir=${SAVE_DIR}"
  trainer.resume_mode=auto
  trainer.default_hdfs_dir=null
  trainer.save_freq=20
  trainer.test_freq=20
  trainer.total_epochs=100
)
python3 -m verl.trainer.main_ppo "${overrides[@]}" "$@"
\ No newline at end of file
#!/bin/bash
# Settings for the codev-7b-80k GRPO run started with verl's PPO trainer.
# Environment read here: WANDB_DIR, SAVE_DIR, WANDB_API_KEY (reported only)
# and MODEL_PATH (optional override of the default checkpoint).
set -euxo pipefail # already enables -x, so a separate `set -x` is not needed

project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'

adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

# An early version for DAPO
enable_filter_groups=False
gen_prompt_bsz=512 # NOTE: no filtering here
train_prompt_bsz=512
train_prompt_mini_bsz=32
n_resp_per_prompt=16
use_token_level_loss=False

# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4

export VLLM_USE_V1=1

# Report the run environment. Unset values are shown as <unset> rather than
# aborting under `set -u`, and the W&B key itself is never written to the log
# (neither by echo nor by the xtrace of these lines).
echo "WANDB_DIR=${WANDB_DIR:-<unset>}"
echo "SAVE_DIR=${SAVE_DIR:-<unset>}"
wandb_key_state=${WANDB_API_KEY:+set}
echo "WANDB_API_KEY is ${wandb_key_state:-unset}"

# Use the default checkpoint only when MODEL_PATH was not provided.
MODEL_PATH=${MODEL_PATH:-"/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"}
# Launch the codev-7b-80k GRPO run (80k_r1 training data, 10k_qwq validation
# data) through verl's PPO entry point. GPU and node counts come from
# USER_GPUS_PER_NODE and SLURM_JOB_NUM_NODES, checkpoints go to SAVE_DIR, and
# any script arguments are appended after the overrides below.
#
# The overrides are kept in an array so each entry is exactly one argument.
# With backslash continuations, a missing space before the backslash (as on
# the former `use_dynamic_bsz=True\` line) can fuse two overrides into one.
overrides=(
  # Algorithm and data
  algorithm.adv_estimator=grpo
  data.train_files=/nfs_global/S/zhangxiaoyun/verl/data/codev/v1/80k_r1/train.parquet
  data.val_files=/nfs_global/S/zhangxiaoyun/deepscaler_codev/data/codev/v1_1/10k_qwq/test.parquet
  data.train_batch_size=512
  data.val_batch_size=512
  data.max_prompt_length=2048
  data.max_response_length=16384

  # Model
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  +actor_rollout_ref.model.override_config.attention_dropout=0.
  +actor_rollout_ref.model.override_config.embd_pdrop=0.
  +actor_rollout_ref.model.override_config.resid_pdrop=0.
  # Previously passed twice with the same value; once is enough.
  actor_rollout_ref.model.enable_gradient_checkpointing=True

  # Actor
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.optim.lr_warmup_steps=10
  actor_rollout_ref.actor.optim.weight_decay=0.0001
  actor_rollout_ref.actor.use_dynamic_bsz=True
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768
  actor_rollout_ref.model.use_remove_padding=True
  "actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}"
  "actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}"
  actor_rollout_ref.actor.ppo_mini_batch_size=256
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.00
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  actor_rollout_ref.actor.entropy_coeff=0
  actor_rollout_ref.actor.grad_clip=1.0
  "actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss}"
  actor_rollout_ref.actor.fsdp_config.param_offload=False
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False

  # Rollout
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=32768
  actor_rollout_ref.rollout.tensor_model_parallel_size=4
  actor_rollout_ref.rollout.name=vllm
  actor_rollout_ref.rollout.n=8
  actor_rollout_ref.rollout.val_kwargs.n=2
  actor_rollout_ref.rollout.temperature=0.9
  actor_rollout_ref.rollout.val_kwargs.temperature=0.85
  actor_rollout_ref.rollout.val_kwargs.do_sample=True
  actor_rollout_ref.rollout.gpu_memory_utilization=0.7
  actor_rollout_ref.rollout.enforce_eager=False
  actor_rollout_ref.rollout.free_cache_engine=False

  # Reward
  reward_model.reward_manager=prime
  actor_rollout_ref.ref.fsdp_config.param_offload=True
  "custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer}"
  "custom_reward_function.overlong_buffer.len=${overlong_buffer_len}"
  "custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor}"
  algorithm.kl_ctrl.kl_coef=0.00

  # Trainer
  trainer.critic_warmup=0
  # Quoted as a whole so the brackets are never treated as a glob pattern;
  # the argument passed on is unchanged.
  'trainer.logger=[console,wandb]'
  trainer.project_name=codev
  trainer.experiment_name=codev-7b-80k
  "trainer.n_gpus_per_node=${USER_GPUS_PER_NODE}"
  "trainer.nnodes=${SLURM_JOB_NUM_NODES}"
  +trainer.val_before_train=True
  "trainer.default_local_dir=${SAVE_DIR}"
  trainer.resume_mode=auto
  trainer.default_hdfs_dir=null
  trainer.save_freq=20
  trainer.test_freq=10
  trainer.total_epochs=100
)
python3 -m verl.trainer.main_ppo "${overrides[@]}" "$@"
\ No newline at end of file
#!/bin/bash
# Settings for a small single-node codev GRPO run started with verl's PPO
# trainer.
# Usage: script [--model PATH] [extra overrides...]
# The checkpoint is taken from --model, else from the MODEL_PATH environment
# variable, else from the default below.
set -euxo pipefail # already enables -x, so a separate `set -x` is not needed

project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'

adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

# An early version for DAPO
enable_filter_groups=False
gen_prompt_bsz=512 # NOTE: no filtering here
train_prompt_bsz=512
train_prompt_mini_bsz=32
n_resp_per_prompt=16
use_token_level_loss=False

# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4

export VLLM_USE_V1=1

# Parse leading script options. Parsing stops at the first argument that is
# not a recognised option; whatever remains stays in "$@".
while [[ $# -gt 0 ]]; do
  case "$1" in
    --model)
      # Without this guard `shift 2` fails and `set -e` ends the script
      # with no explanation.
      if [[ $# -lt 2 ]]; then
        echo "error: --model requires a path argument" >&2
        exit 2
      fi
      MODEL_PATH="$2"
      shift 2
      ;;
    *)
      break
      ;;
  esac
done

# Use the default checkpoint only when no model path was provided. The
# previous unconditional assignment discarded the value given via --model.
MODEL_PATH=${MODEL_PATH:-"/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"}
# Launch a small GRPO run on one node with 8 GPUs (console logging only, no
# checkpoint saving) through verl's PPO entry point. Remaining script
# arguments are appended after the overrides below.
#
# The overrides are kept in an array so each entry is exactly one argument.
# With backslash continuations, a missing space before the backslash (as on
# the former `use_dynamic_bsz=True\` line) can fuse two overrides into one.
overrides=(
  # Algorithm and data
  algorithm.adv_estimator=grpo
  data.train_files=/nfs_global/S/zhangxiaoyun/verl/data/codev/v1/80k_r1/train.parquet
  data.val_files=/nfs_global/S/zhangxiaoyun/deepscaler_codev/data/codev/v1_1/10k_qwq/test.parquet
  data.train_batch_size=64
  data.val_batch_size=512
  data.max_prompt_length=2048
  data.max_response_length=8192

  # Model
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  +actor_rollout_ref.model.override_config.attention_dropout=0.
  +actor_rollout_ref.model.override_config.embd_pdrop=0.
  +actor_rollout_ref.model.override_config.resid_pdrop=0.
  # Previously passed twice with the same value; once is enough.
  actor_rollout_ref.model.enable_gradient_checkpointing=True

  # Actor
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.optim.lr_warmup_steps=10
  actor_rollout_ref.actor.optim.weight_decay=0.1
  actor_rollout_ref.actor.use_dynamic_bsz=True
  actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768
  actor_rollout_ref.model.use_remove_padding=True
  "actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}"
  "actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}"
  actor_rollout_ref.actor.ppo_mini_batch_size=64
  actor_rollout_ref.actor.use_kl_loss=True
  actor_rollout_ref.actor.kl_loss_coef=0.00
  actor_rollout_ref.actor.kl_loss_type=low_var_kl
  actor_rollout_ref.actor.entropy_coeff=0
  actor_rollout_ref.actor.grad_clip=1.0
  "actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss}"
  actor_rollout_ref.actor.fsdp_config.param_offload=False
  actor_rollout_ref.actor.fsdp_config.optimizer_offload=False

  # Rollout
  actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=32768
  actor_rollout_ref.rollout.tensor_model_parallel_size=4
  actor_rollout_ref.rollout.name=vllm
  actor_rollout_ref.rollout.n=2
  actor_rollout_ref.rollout.val_kwargs.n=1
  actor_rollout_ref.rollout.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.do_sample=True
  actor_rollout_ref.rollout.gpu_memory_utilization=0.7
  actor_rollout_ref.rollout.enforce_eager=False
  actor_rollout_ref.rollout.free_cache_engine=False

  # Reward
  reward_model.reward_manager=prime
  actor_rollout_ref.ref.fsdp_config.param_offload=True
  "custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer}"
  "custom_reward_function.overlong_buffer.len=${overlong_buffer_len}"
  "custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor}"
  algorithm.kl_ctrl.kl_coef=0.00

  # Trainer
  trainer.critic_warmup=0
  # Quoted as a whole so the brackets are never treated as a glob pattern;
  # the argument passed on is unchanged.
  'trainer.logger=[console]'
  trainer.project_name=codev
  trainer.experiment_name=codev-7b-80k
  trainer.n_gpus_per_node=8
  trainer.nnodes=1
  +trainer.val_before_train=True
  trainer.resume_mode=auto
  trainer.default_hdfs_dir=null
  trainer.save_freq=-1
  trainer.test_freq=10
  trainer.total_epochs=100
)
python3 -m verl.trainer.main_ppo "${overrides[@]}" "$@"
\ No newline at end of file
#!/usr/bin/env bash
# Settings for the "DAPO-Early-Qwen2.5-32B" run submitted as a Ray job.
# Every upper-case setting below can be overridden from the environment.
set -euxo pipefail

project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'

adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

# An early version for DAPO
enable_filter_groups=False
gen_prompt_bsz=512 # NOTE: no filtering here
train_prompt_bsz=512
train_prompt_mini_bsz=32
n_resp_per_prompt=16
use_token_level_loss=False

# Ray
# Exported so that child processes (the `ray` CLI) actually see the default;
# a plain assignment only reaches them when the caller had already exported it.
export RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}

# Paths
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-32B"}
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4
# Submit the training run as a Ray job without waiting for it to finish.
# Everything after `--` is the entrypoint command run by the job.
#
# The overrides are kept in an array so each entry is exactly one argument:
# every expansion is quoted, and the logger list is quoted as a whole so its
# brackets are never treated as a glob pattern (the argument is unchanged).
overrides=(
  # Data
  "data.train_files=${TRAIN_FILE}"
  "data.val_files=${TEST_FILE}"
  data.prompt_key=prompt
  data.truncation=left
  "data.max_prompt_length=${max_prompt_length}"
  "data.max_response_length=${max_response_length}"
  "data.gen_batch_size=${gen_prompt_bsz}"
  "data.train_batch_size=${train_prompt_bsz}"

  # Algorithm
  "actor_rollout_ref.rollout.n=${n_resp_per_prompt}"
  "algorithm.adv_estimator=${adv_estimator}"
  "algorithm.kl_ctrl.kl_coef=${kl_coef}"
  "actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}"
  "actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}"
  "actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}"
  "algorithm.filter_groups.enable=${enable_filter_groups}"

  # Batching
  actor_rollout_ref.model.use_remove_padding=True
  "actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}"
  "actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}"
  "actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}"

  # Model and actor
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  +actor_rollout_ref.model.override_config.attention_dropout=0.
  +actor_rollout_ref.model.override_config.embd_pdrop=0.
  +actor_rollout_ref.model.override_config.resid_pdrop=0.
  actor_rollout_ref.model.enable_gradient_checkpointing=True
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.optim.lr_warmup_steps=10
  actor_rollout_ref.actor.optim.weight_decay=0.1
  "actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}"
  "actor_rollout_ref.actor.fsdp_config.param_offload=${offload}"
  "actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload}"
  actor_rollout_ref.actor.entropy_coeff=0
  actor_rollout_ref.actor.grad_clip=1.0
  "actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss}"
  "actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size}"

  # Rollout
  actor_rollout_ref.rollout.gpu_memory_utilization=0.80
  "actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}"
  actor_rollout_ref.rollout.enable_chunked_prefill=True
  "actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length))"
  "actor_rollout_ref.rollout.val_kwargs.top_k=${val_top_k}"
  actor_rollout_ref.rollout.val_kwargs.top_p=1.0
  actor_rollout_ref.rollout.val_kwargs.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.n=1
  actor_rollout_ref.rollout.val_kwargs.do_sample=True

  # Reference model
  "actor_rollout_ref.ref.fsdp_config.param_offload=${offload}"
  "actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size}"
  actor_rollout_ref.actor.fsdp_config.fsdp_size=-1

  # Reward
  "custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer}"
  "custom_reward_function.overlong_buffer.len=${overlong_buffer_len}"
  "custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor}"

  # Trainer
  'trainer.logger=[console,wandb]'
  "trainer.project_name=${project_name}"
  "trainer.experiment_name=${exp_name}"
  trainer.n_gpus_per_node=8
  "trainer.nnodes=${NNODES}"
  +trainer.val_before_train=True
  trainer.test_freq=5
  trainer.save_freq=5
  trainer.total_epochs=1
  "trainer.default_local_dir=${CKPTS_DIR}"
  trainer.resume_mode=auto
)
ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
  --working-dir "${WORKING_DIR}" \
  -- python3 -m verl.trainer.main_ppo "${overrides[@]}"
\ No newline at end of file
#!/usr/bin/env bash
# Settings for the "DAPO-Qwen2.5-32B" run submitted as a Ray job.
# Every upper-case setting below can be overridden from the environment.
set -euxo pipefail

project_name='DAPO'
exp_name='DAPO-Qwen2.5-32B'

adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 4))
overlong_penalty_factor=1.0

enable_filter_groups=True
filter_groups_metric=acc
max_num_gen_batches=10
train_prompt_bsz=512
# Three times as many prompts are generated per step as are trained on.
gen_prompt_bsz=$((train_prompt_bsz * 3))
n_resp_per_prompt=16
train_prompt_mini_bsz=32
use_token_level_loss=True

# Ray
# Exported so that child processes (the `ray` CLI) actually see the default;
# a plain assignment only reaches them when the caller had already exported it.
export RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}

# Paths
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-32B"}
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4
# Submit the training run as a Ray job without waiting for it to finish.
# Everything after `--` is the entrypoint command run by the job.
#
# The overrides are kept in an array so each entry is exactly one argument:
# every expansion is quoted, and the logger list is quoted as a whole so its
# brackets are never treated as a glob pattern (the argument is unchanged).
overrides=(
  # Data
  "data.train_files=${TRAIN_FILE}"
  "data.val_files=${TEST_FILE}"
  data.prompt_key=prompt
  data.truncation=left
  "data.max_prompt_length=${max_prompt_length}"
  "data.max_response_length=${max_response_length}"
  "data.gen_batch_size=${gen_prompt_bsz}"
  "data.train_batch_size=${train_prompt_bsz}"

  # Algorithm
  "actor_rollout_ref.rollout.n=${n_resp_per_prompt}"
  "algorithm.adv_estimator=${adv_estimator}"
  "algorithm.kl_ctrl.kl_coef=${kl_coef}"
  "actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}"
  "actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}"
  "actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}"
  "algorithm.filter_groups.enable=${enable_filter_groups}"
  "algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches}"
  "algorithm.filter_groups.metric=${filter_groups_metric}"

  # Batching
  actor_rollout_ref.model.use_remove_padding=True
  "actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${actor_ppo_max_token_len}"
  "actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}"
  "actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len}"

  # Model and actor
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  +actor_rollout_ref.model.override_config.attention_dropout=0.
  +actor_rollout_ref.model.override_config.embd_pdrop=0.
  +actor_rollout_ref.model.override_config.resid_pdrop=0.
  actor_rollout_ref.model.enable_gradient_checkpointing=True
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.optim.lr_warmup_steps=10
  actor_rollout_ref.actor.optim.weight_decay=0.1
  "actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}"
  "actor_rollout_ref.actor.fsdp_config.param_offload=${offload}"
  "actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload}"
  actor_rollout_ref.actor.entropy_coeff=0
  actor_rollout_ref.actor.grad_clip=1.0
  "actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss}"
  "actor_rollout_ref.actor.ulysses_sequence_parallel_size=${sp_size}"

  # Rollout
  actor_rollout_ref.rollout.gpu_memory_utilization=0.80
  "actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp}"
  actor_rollout_ref.rollout.enable_chunked_prefill=True
  "actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length))"
  "actor_rollout_ref.rollout.val_kwargs.top_k=${val_top_k}"
  actor_rollout_ref.rollout.val_kwargs.top_p=1.0
  actor_rollout_ref.rollout.val_kwargs.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.n=1
  actor_rollout_ref.rollout.val_kwargs.do_sample=True

  # Reference model
  "actor_rollout_ref.ref.fsdp_config.param_offload=${offload}"
  "actor_rollout_ref.ref.ulysses_sequence_parallel_size=${sp_size}"
  actor_rollout_ref.actor.fsdp_config.fsdp_size=-1

  # Reward
  "custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer}"
  "custom_reward_function.overlong_buffer.len=${overlong_buffer_len}"
  "custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor}"

  # Trainer
  'trainer.logger=[console,wandb]'
  "trainer.project_name=${project_name}"
  "trainer.experiment_name=${exp_name}"
  trainer.n_gpus_per_node=8
  "trainer.nnodes=${NNODES}"
  +trainer.val_before_train=True
  trainer.test_freq=5
  trainer.save_freq=5
  trainer.total_epochs=1
  "trainer.default_local_dir=${CKPTS_DIR}"
  trainer.resume_mode=auto
)
ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
  --working-dir "${WORKING_DIR}" \
  -- python3 -m verl.trainer.main_ppo "${overrides[@]}"
\ No newline at end of file
#!/usr/bin/env bash
# Settings for the "DAPO-Qwen2.5-7B-Math-Test" run submitted as a Ray job.
# Every upper-case setting below can be overridden from the environment.
set -euxo pipefail

project_name='DAPO'
exp_name='DAPO-Qwen2.5-7B-Math-Test'

adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28

enable_overlong_buffer=True
overlong_buffer_len=512
overlong_penalty_factor=1.0

enable_filter_groups=True
filter_groups_metric=acc
max_num_gen_batches=10
train_prompt_bsz=512
# Three times as many prompts are generated per step as are trained on.
gen_prompt_bsz=$((train_prompt_bsz * 3))
train_prompt_mini_bsz=32
n_resp_per_prompt=16
use_token_level_loss=True

# Ray
# Exported so that child processes (the `ray` CLI) actually see the default;
# a plain assignment only reaches them when the caller had already exported it.
export RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-4}

# Paths
RAY_DATA_HOME=${RAY_DATA_HOME:-"${HOME}/verl"}
MODEL_PATH=${MODEL_PATH:-"${RAY_DATA_HOME}/models/Qwen2.5-Math-7B"}
CKPTS_DIR=${CKPTS_DIR:-"${RAY_DATA_HOME}/ckpts/${project_name}/${exp_name}"}
TRAIN_FILE=${TRAIN_FILE:-"${RAY_DATA_HOME}/data/dapo-math-17k.parquet"}
TEST_FILE=${TEST_FILE:-"${RAY_DATA_HOME}/data/aime-2024.parquet"}

# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 2))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout

# Mathematically equivalent
use_dynamic_bsz=True
infer_micro_batch_size=null
train_micro_batch_size=null
offload=False
# Submit the training run as a Ray job without waiting for it to finish.
# Everything after `--` is the entrypoint command run by the job.
#
# The overrides are kept in an array so each entry is exactly one argument.
# With backslash continuations, a missing space before the backslash (as on
# the former `val_kwargs.top_p=1.0\` line) can fuse two overrides into one.
# Two overrides that used to be passed twice now appear once:
#   - data.truncation (same value both times)
#   - actor.use_token_level_loss (the second, hard-coded True shadowed the
#     use_token_level_loss variable, which is itself True in this script)
overrides=(
  # Data
  "data.train_files=${TRAIN_FILE}"
  "data.val_files=${TEST_FILE}"
  data.prompt_key=prompt
  data.truncation=left
  "data.max_prompt_length=${max_prompt_length}"
  "data.max_response_length=${max_response_length}"
  "data.gen_batch_size=${gen_prompt_bsz}"
  "data.train_batch_size=${train_prompt_bsz}"

  # Algorithm
  "actor_rollout_ref.rollout.n=${n_resp_per_prompt}"
  "actor_rollout_ref.actor.kl_loss_coef=${kl_loss_coef}"
  "actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low}"
  "actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high}"
  "algorithm.adv_estimator=${adv_estimator}"
  "algorithm.kl_ctrl.kl_coef=${kl_coef}"
  "algorithm.filter_groups.enable=${enable_filter_groups}"
  "algorithm.filter_groups.metric=${filter_groups_metric}"
  "algorithm.filter_groups.max_num_gen_batches=${max_num_gen_batches}"

  # Batching
  actor_rollout_ref.model.use_remove_padding=True
  "actor_rollout_ref.actor.use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.ref.log_prob_use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=${use_dynamic_bsz}"
  "actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$((max_prompt_length + max_response_length))"
  "actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=$((max_prompt_length + max_response_length))"
  "actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=$((max_prompt_length + max_response_length))"

  # Model and actor
  "actor_rollout_ref.model.path=${MODEL_PATH}"
  +actor_rollout_ref.model.override_config.attention_dropout=0.
  +actor_rollout_ref.model.override_config.embd_pdrop=0.
  +actor_rollout_ref.model.override_config.resid_pdrop=0.
  actor_rollout_ref.model.enable_gradient_checkpointing=True
  actor_rollout_ref.actor.optim.lr=1e-6
  actor_rollout_ref.actor.optim.lr_warmup_steps=10
  actor_rollout_ref.actor.optim.weight_decay=0.1
  "actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz}"
  "actor_rollout_ref.actor.ppo_micro_batch_size=${train_micro_batch_size}"
  "actor_rollout_ref.actor.fsdp_config.param_offload=${offload}"
  "actor_rollout_ref.actor.fsdp_config.optimizer_offload=${offload}"
  actor_rollout_ref.actor.entropy_coeff=0
  actor_rollout_ref.actor.grad_clip=1.0
  "actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss}"
  actor_rollout_ref.actor.ulysses_sequence_parallel_size=1

  # Rollout
  actor_rollout_ref.rollout.gpu_memory_utilization=0.85
  "actor_rollout_ref.rollout.log_prob_micro_batch_size=${infer_micro_batch_size}"
  actor_rollout_ref.rollout.tensor_model_parallel_size=1
  actor_rollout_ref.rollout.enable_chunked_prefill=True
  "actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length))"
  "actor_rollout_ref.rollout.val_kwargs.top_k=${val_top_k}"
  actor_rollout_ref.rollout.val_kwargs.top_p=1.0
  actor_rollout_ref.rollout.val_kwargs.temperature=1.0
  actor_rollout_ref.rollout.val_kwargs.n=1
  actor_rollout_ref.rollout.val_kwargs.do_sample=True

  # Reference model
  "actor_rollout_ref.ref.log_prob_micro_batch_size=${infer_micro_batch_size}"
  "actor_rollout_ref.ref.fsdp_config.param_offload=${offload}"
  actor_rollout_ref.ref.ulysses_sequence_parallel_size=1
  actor_rollout_ref.actor.fsdp_config.fsdp_size=-1

  # Reward
  "custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer}"
  "custom_reward_function.overlong_buffer.len=${overlong_buffer_len}"
  "custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor}"

  # Trainer
  # Quoted as a whole so the brackets are never treated as a glob pattern;
  # the argument passed on is unchanged.
  'trainer.logger=[console,wandb]'
  "trainer.project_name=${project_name}"
  "trainer.experiment_name=${exp_name}"
  trainer.n_gpus_per_node=8
  "trainer.nnodes=${NNODES}"
  +trainer.val_before_train=True
  trainer.test_freq=2
  trainer.save_freq=2
  trainer.total_epochs=1
  "trainer.default_local_dir=${CKPTS_DIR}"
  trainer.resume_mode=disable
)
ray job submit --no-wait --runtime-env="${RUNTIME_ENV}" \
  --working-dir "${WORKING_DIR}" \
  -- python3 -m verl.trainer.main_ppo "${overrides[@]}"
\ No newline at end of file
# Launch recipe.prime.main_prime for PRIME-RL/Eurus-2-7B-SFT on the combined
# GSM8K + MATH parquet files under $HOME/data.
# Any arguments given to this script are appended to the command line
# unchanged (one word per argument).
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

# Dataset locations.
gsm8k_train_path=$HOME/data/gsm8k/train.parquet
gsm8k_test_path=$HOME/data/gsm8k/test.parquet
math_train_path=$HOME/data/math/train.parquet
math_test_path=$HOME/data/math/test.parquet

# The file lists are passed as a single bracketed, comma-separated string.
train_files="['$gsm8k_train_path', '$math_train_path']"
test_files="['$gsm8k_test_path', '$math_test_path']"

# The same model path is used for the actor and for the reward model.
model_path=PRIME-RL/Eurus-2-7B-SFT

# "$@" is quoted so caller-supplied overrides containing spaces or glob
# characters reach the trainer exactly as typed.
python3 -m recipe.prime.main_prime \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.train_batch_size=64 \
    data.val_batch_size=6312 \
    data.max_prompt_length=1024 \
    data.max_response_length=3072 \
    data.filter_overlong_prompts=True \
    data.filter_accuracy=True \
    data.accuracy_lower_bound=0.2 \
    data.accuracy_upper_bound=0.8 \
    data.oversample_factor=4 \
    actor_rollout_ref.model.path="$model_path" \
    actor_rollout_ref.actor.optim.lr=5e-7 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=64 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.n=4 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
    algorithm.adv_estimator=rloo \
    reward_model.model.path="$model_path" \
    reward_model.micro_batch_size=8 \
    reward_model.model.update=before \
    reward_model.model.beta_train=0.05 \
    reward_model.model.optim.lr=1e-6 \
    reward_model.model.optim.grad_clip=10.0 \
    reward_model.model.input_tokenizer=null \
    reward_model.mini_batch_size=64 \
    trainer.val_before_train=False \
    trainer.logger=['console','wandb'] \
    trainer.project_name='prime_example' \
    trainer.experiment_name='Eurus-2-7B-SFT' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=10 \
    trainer.total_epochs=15 "$@"
# Three-stage pipeline: build the evaluation data, generate responses for
# it, then score the generated responses.
# Every expansion of DATA_PATH / MODEL_PATH is quoted so that a path
# containing spaces or glob characters is still passed as a single argument.
MODEL_PATH=Qwen/DeepSeek-R1-Distill-Qwen-1.5B
DATA_PATH=/workspace/datasets/r1_bench

# Eval Data Process
python3 -m recipe.r1.data_process \
    --local_dir "$DATA_PATH" \
    --tasks all

# Generation
# Reads $DATA_PATH/test.parquet and writes $DATA_PATH/test-output-8.parquet
# (data.n_samples=8).
python3 -m verl.trainer.main_generation \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node=8 \
    data.path="$DATA_PATH/test.parquet" \
    data.prompt_key=prompt \
    data.batch_size=1024 \
    data.n_samples=8 \
    data.output_path="$DATA_PATH/test-output-8.parquet" \
    model.path="$MODEL_PATH" \
    rollout.temperature=0.6 \
    rollout.top_p=0.95 \
    rollout.prompt_length=1024 \
    rollout.response_length=32768 \
    rollout.tensor_model_parallel_size=1 \
    rollout.gpu_memory_utilization=0.9 \
    rollout.max_num_batched_tokens=65536

# Evaluation
# Consumes the parquet file written by the generation step above.
python3 -m recipe.r1.main_eval \
    data.path="$DATA_PATH/test-output-8.parquet" \
    data.prompt_key=prompt \
    data.response_key=responses \
    custom_reward_function.path=recipe/r1/reward_score.py \
    custom_reward_function.name=reward_func
WARNING: Did not unuse /usr/share/Modules/modulefiles
No Modulefiles Currently Loaded.
Currently Loaded Modulefiles:
1) cluster-tools/v1.0 3) gcc/9.3.0
2) slurm-tools/v1.0 4) cuda-cudnn/11.8-8.8.1
/usr/bin/which: no python in (/tools/cluster-software/cuda-cudnn/cuda-11.8.0-8.8.1/bin:/tools/cluster-software/gcc/gcc-9.3.0/bin:/tools/cluster-software/slurm-tools/slurm-tools-v1.0/bin:/tools/cluster-software/cluster-tools/cluster-tools-v1.0/bin:/home/S/wuyt/.elan/bin:/home/S/wuyt/.cargo/bin:/home/S/wuyt/nfs_global/anaconda3/envs/deepscaler/bin:/home/S/wuyt/lustre/anaconda3/condabin:/home/S/wuyt/.local/bin:/home/S/wuyt/bin:/usr/share/Modules/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/nfs_global/S/wuyt/.local/bin:/nfs_global/S/wuyt/wuyt/git-lfs/git-lfs-3.2.0)
Currently Loaded Modulefiles:
1) cluster-tools/v1.0 3) gcc/9.3.0
2) slurm-tools/v1.0 4) cuda-cudnn/11.8-8.8.1
/usr/bin/which: no python in (/tools/cluster-software/cuda-cudnn/cuda-11.8.0-8.8.1/bin:/tools/cluster-software/gcc/gcc-9.3.0/bin:/tools/cluster-software/slurm-tools/slurm-tools-v1.0/bin:/tools/cluster-software/cluster-tools/cluster-tools-v1.0/bin:/home/S/wuyt/.elan/bin:/home/S/wuyt/.cargo/bin:/home/S/wuyt/nfs_global/anaconda3/envs/deepscaler/bin:/home/S/wuyt/lustre/anaconda3/condabin:/home/S/wuyt/.local/bin:/home/S/wuyt/bin:/usr/share/Modules/bin:/usr/local/bin:/usr/bin:/usr/local/sbin:/usr/sbin:/nfs_global/S/wuyt/.local/bin:/nfs_global/S/wuyt/wuyt/git-lfs/git-lfs-3.2.0)
wandb: Appending key for api.wandb.ai to your netrc file: /home/S/wuyt/.netrc
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin
wandb: Appending key for api.wandb.ai to your netrc file: /home/S/wuyt/.netrc
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin
[2025-04-01 10:04:11,489 W 634026 634026] global_state_accessor.cc:429: Retrying to get node with node ID 2ab9de94a7185bdf0132a2fa88197f4972316281e3d52a2ad135506d
2025-04-01 10:04:20,398 INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_443717d5f8aeb41f.zip.
2025-04-01 10:04:20,398 INFO packaging.py:575 -- Creating a file package for local module '.'.
train-multigpu.sh: line 220: 728523 Terminated copy_log_and_plot
chmod: changing permissions of '../tmp': Operation not permitted
cp: cannot create special file '../tmp/ray_wuyt/session_latest/sockets/plasma_store': File exists
cp: cannot create special file '../tmp/ray_wuyt/session_latest/sockets/raylet': File exists
/var/log/atop/atop_20250401 - stat raw file: No such file or directory
This source diff could not be displayed because it is too large. You can view the blob instead.
WARNING: Did not unuse /usr/share/Modules/modulefiles
No Modulefiles Currently Loaded.
Currently Loaded Modulefiles:
1) cluster-tools/v1.0 3) gcc/9.3.0
2) slurm-tools/v1.0 4) cuda-cudnn/11.8-8.8.1
Currently Loaded Modulefiles:
1) cluster-tools/v1.0 3) gcc/9.3.0
2) slurm-tools/v1.0 4) cuda-cudnn/11.8-8.8.1
wandb: Appending key for api.wandb.ai to your netrc file: /home/S/wuyt/.netrc
wandb: Appending key for api.wandb.ai to your netrc file: /home/S/wuyt/.netrc
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin
[2025-04-02 01:06:35,166 W 3115729 3115729] global_state_accessor.cc:429: Retrying to get node with node ID 736407c8bd15a81bfe135e9345be09f72e478a7f772f20455efb29e7
2025-04-02 01:06:45,530 INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_55d2a6863b4a199c.zip.
2025-04-02 01:06:45,531 INFO packaging.py:575 -- Creating a file package for local module '.'.
train-multigpu.sh: line 223: 595967 Terminated copy_log_and_plot
Traceback (most recent call last):
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 298, in <module>
plot_data(args.folder, no_ratio=args.no_ratio)
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 282, in plot_data
plot_different_accuracy_ratio(folder)
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 127, in plot_different_accuracy_ratio
df = pd.read_csv(csv_file_path)
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 1026, in read_csv
return _read(filepath_or_buffer, kwds)
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 620, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 1620, in __init__
self._engine = self._make_engine(f, self.engine)
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/io/parsers/readers.py", line 1898, in _make_engine
return mapping[engine](f, **self.options)
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/io/parsers/c_parser_wrapper.py", line 93, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "parsers.pyx", line 581, in pandas._libs.parsers.TextReader.__cinit__
pandas.errors.EmptyDataError: No columns to parse from file
chmod: changing permissions of '../tmp': Operation not permitted
cp: cannot create special file '../tmp/ray_wuyt/session_latest/sockets/plasma_store': File exists
cp: cannot create special file '../tmp/ray_wuyt/session_latest/sockets/raylet': File exists
/var/log/atop/atop_20250402 - stat raw file: No such file or directory
This source diff could not be displayed because it is too large. You can view the blob instead.
WARNING: Did not unuse /usr/share/Modules/modulefiles
No Modulefiles Currently Loaded.
Currently Loaded Modulefiles:
1) cluster-tools/v1.0 3) gcc/9.3.0
2) slurm-tools/v1.0 4) cuda-cudnn/11.8-8.8.1
wandb: Appending key for api.wandb.ai to your netrc file: /home/S/wuyt/.netrc
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin
2025-04-02 01:14:14,510 INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_55d2a6863b4a199c.zip.
2025-04-02 01:14:14,510 INFO packaging.py:575 -- Creating a file package for local module '.'.
Traceback (most recent call last):
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
return self._engine.get_loc(casted_key)
File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'reward/correct_0%_ratio'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 298, in <module>
plot_data(args.folder, no_ratio=args.no_ratio)
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 282, in plot_data
plot_different_accuracy_ratio(folder)
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 131, in plot_different_accuracy_ratio
df[f'{col}_smoothed'] = smooth_data(df[col])
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/core/frame.py", line 4102, in __getitem__
indexer = self.columns.get_loc(key)
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
raise KeyError(key) from err
KeyError: 'reward/correct_0%_ratio'
Traceback (most recent call last):
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
return self._engine.get_loc(casted_key)
File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
File "pandas/_libs/hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
File "pandas/_libs/hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'reward/correct_0%_ratio'
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 298, in <module>
plot_data(args.folder, no_ratio=args.no_ratio)
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 282, in plot_data
plot_different_accuracy_ratio(folder)
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 131, in plot_different_accuracy_ratio
df[f'{col}_smoothed'] = smooth_data(df[col])
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/core/frame.py", line 4102, in __getitem__
indexer = self.columns.get_loc(key)
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3812, in get_loc
raise KeyError(key) from err
KeyError: 'reward/correct_0%_ratio'
train-multigpu.sh: line 223: 618003 Terminated copy_log_and_plot
Traceback (most recent call last):
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 298, in <module>
plot_data(args.folder, no_ratio=args.no_ratio)
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 282, in plot_data
plot_different_accuracy_ratio(folder)
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 131, in plot_different_accuracy_ratio
df[f'{col}_smoothed'] = smooth_data(df[col])
File "/nfs_global/S/zhuyaoyu/projects/verl/plot_and_analyze/plot.py", line 44, in smooth_data
return data.rolling(window=window_size, min_periods=1).mean()
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/core/generic.py", line 12580, in rolling
return Rolling(
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/core/window/rolling.py", line 170, in __init__
self._validate()
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/core/window/rolling.py", line 1869, in _validate
super()._validate()
File "/workspace/S/zhuyaoyu/softwares/miniconda3/envs/verl/lib/python3.10/site-packages/pandas/core/window/rolling.py", line 181, in _validate
raise ValueError(
ValueError: min_periods 1 must be <= window 0
chmod: changing permissions of '../tmp': Operation not permitted
cp: cannot create special file '../tmp/ray_wuyt/session_latest/sockets/plasma_store': File exists
cp: cannot create special file '../tmp/ray_wuyt/session_latest/sockets/raylet': File exists
/var/log/atop/atop_20250402 - stat raw file: No such file or directory
This source diff could not be displayed because it is too large. You can view the blob instead.
WARNING: Did not unuse /usr/share/Modules/modulefiles
No Modulefiles Currently Loaded.
Currently Loaded Modulefiles:
1) cluster-tools/v1.0 3) gcc/9.3.0
2) slurm-tools/v1.0 4) cuda-cudnn/11.8-8.8.1
Currently Loaded Modulefiles:
1) cluster-tools/v1.0 3) gcc/9.3.0
2) slurm-tools/v1.0 4) cuda-cudnn/11.8-8.8.1
wandb: Appending key for api.wandb.ai to your netrc file: /home/S/wuyt/.netrc
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin
wandb: Appending key for api.wandb.ai to your netrc file: /home/S/wuyt/.netrc
wandb: W&B API key is configured. Use `wandb login --relogin` to force relogin
[2025-04-02 11:51:05,525 W 3127793 3127793] global_state_accessor.cc:429: Retrying to get node with node ID b26f7a02826b437d1ff5aec846a4896c59f24a33b9b656ec3cd16f56
2025-04-02 11:51:16,404 INFO dashboard_sdk.py:338 -- Uploading package gcs://_ray_pkg_75d347a5920743dc.zip.
2025-04-02 11:51:16,404 INFO packaging.py:575 -- Creating a file package for local module '.'.
train-multigpu.sh: line 223: 701843 Terminated copy_log_and_plot
chmod: changing permissions of '../tmp': Operation not permitted
cp: cannot create special file '../tmp/ray_wuyt/session_latest/sockets/plasma_store': File exists
cp: cannot create special file '../tmp/ray_wuyt/session_latest/sockets/raylet': File exists
/var/log/atop/atop_20250402 - stat raw file: No such file or directory
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Currently Loaded Modulefiles:
1) git/2.31.1 2) gcc/9.3.0 3) cmake/3.21.7
Currently Loaded Modulefiles:
1) git/2.31.1 3) cmake/3.21.7 5) slurm-tools/v1.0
2) gcc/9.3.0 4) cluster-tools/v1.0 6) cuda-cudnn/12.1-8.9.3
Job start at 2025-04-04 08:51:30
Job run at:
Static hostname: localhost.localdomain
Transient hostname: r8l40-a00.ib.future.cn
Icon name: computer-server
Chassis: server
Machine ID: 5a5f22d1ca484ec4bb0c3310c788be8b
Boot ID: 870c9831f3b64f2ca8b3258b37fb8613
Operating System: Rocky Linux 8.7 (Green Obsidian)
CPE OS Name: cpe:/o:rocky:rocky:8:GA
Kernel: Linux 4.18.0-425.10.1.el8_7.x86_64
Architecture: x86-64
Filesystem Size Used Avail Use% Mounted on
/dev/mapper/rl-root 376G 18G 358G 5% /
/dev/nvme4n1p1 3.5T 25G 3.5T 1% /local
/dev/nvme2n1p1 3.5T 29G 3.5T 1% /tmp
/dev/mapper/rl-var 512G 9.9G 502G 2% /var
/dev/nvme0n1p2 2.0G 366M 1.7G 18% /boot
/dev/nvme1n1p1 3.5T 43G 3.5T 2% /local/nfscache
/dev/nvme0n1p1 599M 5.8M 594M 1% /boot/efi
ssd.nas00.future.cn:/rocky8_home 16G 3.3G 13G 21% /home
ssd.nas00.future.cn:/rocky8_workspace 400G 239G 162G 60% /workspace
ssd.nas00.future.cn:/rocky8_tools 5.0T 75G 5.0T 2% /tools
ssd.nas00.future.cn:/centos7_home 16G 7.6G 8.5G 47% /centos7/home
ssd.nas00.future.cn:/centos7_workspace 400G 5.2G 395G 2% /centos7/workspace
ssd.nas00.future.cn:/centos7_tools 5.0T 235G 4.8T 5% /centos7/tools
ssd.nas00.future.cn:/eda-tools 8.0T 5.7T 2.4T 72% /centos7/eda-tools
hdd.nas00.future.cn:/share_personal 500G 414M 500G 1% /share/personal
zone05.nas01.future.cn:/NAS_HPC_collab_codemodel 34T 33T 858G 98% /share/collab/codemodel
ext-zone00.nas02.future.cn:/nfs_global 289T 276T 14T 96% /nfs_global
ssd.nas00.future.cn:/common_datasets 75T 63T 13T 84% /datasets
192.168.12.10@o2ib:192.168.12.11@o2ib:/lustre 1.9P 54T 1.7P 4% /lustre
beegfs_nodev 70T 15T 56T 21% /fast
Have already added /tools/cluster-modulefiles into $MODULEPATH
/tools/cluster-software/gcc/gcc-9.3.0/bin/gcc
/workspace/S/zhuyaoyu/softwares/miniconda3/bin/python
/workspace/S/zhuyaoyu/softwares/miniconda3/bin/python3
############### /home : /home/S/zhuyaoyu
Disk quotas for user zhuyaoyu (uid 6207):
Filesystem space quota limit grace files quota limit grace
/home 3353M 16384M 20480M 90671 0 0
############### /workspace
Disk quotas for user zhuyaoyu (uid 6207):
Filesystem space quota limit grace files quota limit grace
/workspace 239G 400G 500G 799k 0 0
############### /nfs_global
Disk quotas for user zhuyaoyu (uid 6207):
Filesystem space quota limit grace files quota limit grace
/nfs_global 2410G 5120G 7168G 2069k 5000k 10000k
############### /lustre
Disk quotas for usr zhuyaoyu (uid 6207):
Filesystem used quota limit grace files quota limit grace
/lustre 0k 8T 10T - 0 3000000 36000000 -
uid 6207 is using default block quota setting
uid 6207 is using default file quota setting
name, driver_version, power.limit [W]
NVIDIA L40, 550.54.15, 275.00 W
NVIDIA L40, 550.54.15, 275.00 W
NVIDIA L40, 550.54.15, 275.00 W
NVIDIA L40, 550.54.15, 275.00 W
NVIDIA L40, 550.54.15, 275.00 W
NVIDIA L40, 550.54.15, 275.00 W
NVIDIA L40, 550.54.15, 275.00 W
NVIDIA L40, 550.54.15, 275.00 W
Using GPU(s) 0,1,2,3,4,5,6,7
This job is assigned the following resources by SLURM:
CPU_IDs=0-31,56-87 GRES=gpu:8(IDX:0-7)
Have already added /tools/cluster-modulefiles into $MODULEPATH
Got device mesh tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
dtype=torch.int32), mesh_dim_names ('fsdp',)
Processing model shards with 16 (16,) in total
Writing to local disk
Saving model to ckpt/codev_distill_16k_vllm1_v2/global_step_120/actor/huggingface
Got device mesh tensor([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
dtype=torch.int32), mesh_dim_names ('fsdp',)
Processing model shards with 16 (16,) in total
Writing to local disk
Saving model to ckpt/codev_distill_16k_vllm1_v2/global_step_140/actor/huggingface
Job end at 2025-04-04 08:53:38
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
#!/bin/bash
# Upgrade yapf, then reformat the listed source trees in place (-i),
# recursively (-r), using the style file ./.style.yapf.

# Install through `python3 -m pip` rather than `pip3`, so yapf is installed
# for the same interpreter that runs `python3 -m yapf` below.
python3 -m pip install --upgrade yapf
python3 -m yapf -ir -vv --style ./.style.yapf verl tests single_controller examples recipe
# Checkpoint save/load run for deepseek-coder-1.3b-instruct with the Megatron
# PPO trainer config. verl.trainer.main_ppo is launched twice with the same
# settings, one training step each:
#   1. trainer.save_freq=1
#   2. trainer.resume_mode=auto and trainer.save_freq=-1
#      (presumably resumes from the checkpoint written by run 1; confirm
#      against the trainer's resume logic)
# Any arguments given to this script are appended to BOTH invocations.
set -x

# the config file used: verl/trainer/main_ppo/config/ppo_megatron_trainer.yaml

huggingface-cli download deepseek-ai/deepseek-coder-1.3b-instruct

export VLLM_ATTENTION_BACKEND=XFORMERS

# Run 1. The space before each trailing backslash matters: without it, the
# argument is glued to the next line whenever that line is not indented.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-1.3b-instruct \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
    critic.optim.lr=2e-5 \
    critic.model.path=deepseek-ai/deepseek-coder-1.3b-instruct \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.megatron.pipeline_model_parallel_size=2 \
    critic.megatron.virtual_pipeline_model_parallel_size=2 \
    critic.megatron.tensor_model_parallel_size=2 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger=['console'] \
    trainer.project_name='verl_megatron_gsm8k_examples' \
    trainer.experiment_name='deepseek_megatron_checkpoint_saveload' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=1 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=1 "$@"

# Run 2. Identical to run 1 except for trainer.resume_mode=auto and
# trainer.save_freq=-1.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-1.3b-instruct \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
    critic.optim.lr=2e-5 \
    critic.model.path=deepseek-ai/deepseek-coder-1.3b-instruct \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.megatron.pipeline_model_parallel_size=2 \
    critic.megatron.virtual_pipeline_model_parallel_size=2 \
    critic.megatron.tensor_model_parallel_size=2 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger=['console'] \
    trainer.project_name='verl_megatron_gsm8k_examples' \
    trainer.experiment_name='deepseek_megatron_checkpoint_saveload' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.resume_mode=auto \
    trainer.save_freq=-1 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=1 "$@"
\ No newline at end of file
# Checkpoint save/load run for Qwen/Qwen2.5-0.5B with the Megatron PPO
# trainer config. verl.trainer.main_ppo is launched twice with the same
# settings, one training step each:
#   1. trainer.save_freq=1
#   2. trainer.resume_mode=auto and trainer.save_freq=-1
#      (presumably resumes from the checkpoint written by run 1; confirm
#      against the trainer's resume logic)
# Any arguments given to this script are appended to BOTH invocations.
set -x

# the config file used: verl/trainer/main_ppo/config/ppo_megatron_trainer.yaml

huggingface-cli download Qwen/Qwen2.5-0.5B

export VLLM_ATTENTION_BACKEND=XFORMERS

# Run 1. The space before each trailing backslash matters: without it, the
# argument is glued to the next line whenever that line is not indented.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
    critic.optim.lr=2e-5 \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.megatron.pipeline_model_parallel_size=2 \
    critic.megatron.virtual_pipeline_model_parallel_size=2 \
    critic.megatron.tensor_model_parallel_size=2 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger=['console'] \
    trainer.project_name='verl_megatron_gsm8k_examples' \
    trainer.experiment_name='qwen2_5_0b5_megatron_saveload' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=1 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=1 "$@"

# Run 2. Identical to run 1 except for trainer.resume_mode=auto and
# trainer.save_freq=-1.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
    critic.optim.lr=2e-5 \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.megatron.pipeline_model_parallel_size=2 \
    critic.megatron.virtual_pipeline_model_parallel_size=2 \
    critic.megatron.tensor_model_parallel_size=2 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger=['console'] \
    trainer.project_name='verl_megatron_gsm8k_examples' \
    trainer.experiment_name='qwen2_5_0b5_megatron_saveload' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.resume_mode=auto \
    trainer.save_freq=-1 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=1 "$@"
\ No newline at end of file
#!/usr/bin/env bash
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Runs tests/distributed/test_tensor_dict.py under torchrun with 4 processes
# on a single node (--standalone).
# The interpreter line above must be the very first line of the file; placed
# after the license header it is only an ordinary comment.

# -e: stop at the first failing command; -x: trace each command.
set -e -x

torchrun --nproc-per-node=4 --standalone tests/distributed/test_tensor_dict.py
\ No newline at end of file
# Short GRPO run (algorithm.adv_estimator=grpo) of verl.trainer.main_ppo on
# GSM8K with deepseek-coder-1.3b-instruct, limited to 2 training steps.
# Any arguments given to this script are appended to the command line
# unchanged (one word per argument).
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

# The $HOME-based paths and "$@" are quoted so spaces or glob characters in
# them do not split or expand into extra arguments.
python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-1.3b-instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=80 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=160 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=160 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger=['console'] \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='deepseek_llm_7b_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=2 "$@"
\ No newline at end of file
# Short GRPO run (algorithm.adv_estimator=grpo) of verl.trainer.main_ppo with
# the ppo_megatron_trainer.yaml config, on GSM8K with
# deepseek-coder-1.3b-instruct, limited to 2 training steps.
# Any arguments given to this script are appended to the command line
# unchanged (one word per argument).
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

# The space before each trailing backslash matters: without it, the argument
# is glued to the next line whenever that line is not indented.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=grpo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=1024 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-1.3b-instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger=['console'] \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='deepseek_llm_7b_function_rm_math_megatron' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=2 "$@"
\ No newline at end of file
# Downloads deepseek-ai/deepseek-coder-1.3b-instruct and then runs
# verl.trainer.main_ppo (actor + critic) with the ppo_megatron_trainer.yaml
# config on the GSM8K parquet files, capped at 3 training steps.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: every continuation backslash is preceded by a space so that
# adjacent arguments can never be fused into one word; paths and "$@" are
# quoted; the logger list is quoted so [...] is never glob-expanded.
set -x

# the config file used: verl/trainer/main_ppo/config/ppo_megatron_trainer.yaml

huggingface-cli download deepseek-ai/deepseek-coder-1.3b-instruct

python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-1.3b-instruct \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
    critic.optim.lr=2e-5 \
    critic.model.path=deepseek-ai/deepseek-coder-1.3b-instruct \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.megatron.tensor_model_parallel_size=2 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    trainer.project_name='verl_megatron_gsm8k_examples' \
    trainer.experiment_name='deepseek_llm_1b3_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=3 "$@"
# Downloads deepseek-ai/deepseek-coder-1.3b-instruct and then runs
# verl.trainer.main_ppo (actor + critic) with the ppo_megatron_trainer.yaml
# config on the GSM8K parquet files, with pipeline, virtual-pipeline and
# tensor parallel sizes set explicitly; capped at 3 training steps.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# NOTE(review): the actor uses tensor_model_parallel_size=4 while ref and
# critic use 2 - confirm this asymmetry is intentional.
#
# Quoting notes: every continuation backslash is preceded by a space so that
# adjacent arguments can never be fused into one word; paths and "$@" are
# quoted; the logger list is quoted so [...] is never glob-expanded.
set -x

# the config file used: verl/trainer/main_ppo/config/ppo_megatron_trainer.yaml

huggingface-cli download deepseek-ai/deepseek-coder-1.3b-instruct

python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=deepseek-ai/deepseek-coder-1.3b-instruct \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
    critic.optim.lr=2e-5 \
    critic.model.path=deepseek-ai/deepseek-coder-1.3b-instruct \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.megatron.pipeline_model_parallel_size=2 \
    critic.megatron.virtual_pipeline_model_parallel_size=2 \
    critic.megatron.tensor_model_parallel_size=2 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    trainer.project_name='verl_megatron_gsm8k_examples' \
    trainer.experiment_name='deepseek_llm_1b3_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=3 "$@"
# One-step GRPO run of verl.trainer.main_ppo on the geo3k parquet files with
# Qwen/Qwen2-VL-2B-Instruct, reading images from the `images` column.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: paths are quoted so a $HOME containing spaces stays one word,
# the logger list is quoted so the shell never treats [...] as a glob pattern,
# and "$@" keeps each forwarded argument intact.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/geo3k/train.parquet" \
    data.val_files="$HOME/data/geo3k/test.parquet" \
    data.train_batch_size=128 \
    data.max_prompt_length=1536 \
    data.max_response_length=1536 \
    data.image_key=images \
    actor_rollout_ref.model.path=Qwen/Qwen2-VL-2B-Instruct \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=128 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.rollout.enable_chunked_prefill=False \
    actor_rollout_ref.rollout.enforce_eager=True \
    actor_rollout_ref.rollout.free_cache_engine=False \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    algorithm.adv_estimator=grpo \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    trainer.project_name='verl_example_geo3k' \
    trainer.experiment_name='qwen2vl_e2e_ci_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# Short GRPO run of verl.trainer.main_ppo on the GSM8K parquet files using
# Qwen/Qwen2.5-0.5B with actor parameter and optimizer offload enabled,
# capped at 2 training steps.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: paths are quoted so a $HOME containing spaces stays one word,
# the logger list is quoted so the shell never treats [...] as a glob pattern,
# and "$@" keeps each forwarded argument intact.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=grpo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.actor.fsdp_config.param_offload=True \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='qwen2_7b_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=2 "$@"
\ No newline at end of file
# Short GRPO run of verl.trainer.main_ppo with the ppo_megatron_trainer.yaml
# config on the GSM8K parquet files using Qwen/Qwen2.5-0.5B, capped at
# 2 training steps.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: every continuation backslash is preceded by a space so that
# adjacent arguments can never be fused into one word; paths and "$@" are
# quoted; the logger list is quoted so [...] is never glob-expanded.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    algorithm.adv_estimator=grpo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=1024 \
    data.filter_overlong_prompts=True \
    data.truncation='error' \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.model.enable_gradient_checkpointing=True \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.rollout.n=5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
    trainer.experiment_name='qwen2_7b_function_rm_megatron' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=5 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=2 "$@"
\ No newline at end of file
#!/bin/bash
# Checks that a user-supplied reward function is picked up by the trainer.
#
# Steps:
#   1. Write a tiny reward function to ./my_reward_function.py.
#   2. Run verl.trainer.main_ppo for 2 training steps with
#      custom_reward_function.path/name pointing at it, copying the output to
#      ./output_custom_reward.txt via tee.
#   3. Run tests/e2e/check_custom_rwd_fn.py on the captured output.
#   4. Remove both generated files (also on failure, via the EXIT trap).
set -e -x
# Without pipefail the pipeline status is that of `tee`, so a crashed training
# run would go unnoticed by `set -e`.
set -o pipefail

FILE="$(pwd)/my_reward_function.py"
OUTPUT_FILE="$(pwd)/output_custom_reward.txt"
FUNCTION_NAME="my_reward_function"

# Remove the generated reward function and the captured training output.
cleanup() {
    rm -rf -- "$FILE" "$OUTPUT_FILE"
}
trap cleanup EXIT

# Start from a clean slate in case a previous run left files behind.
cleanup

# Quoted delimiter: the Python source is written out literally.
cat <<'EOF' > "$FILE"
def my_reward_function(data_source, solution_str, ground_truth, extra_info=None):
    print(f"Congratulations!!! You have called my_reward_function successfully!!!")
    return 0.1
EOF

export VLLM_ATTENTION_BACKEND=XFORMERS

# Every continuation backslash is preceded by a space so that adjacent
# arguments (notably the custom_reward_function.* pair) stay separate words.
python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=True \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    algorithm.kl_ctrl.kl_coef=0.001 \
    custom_reward_function.path="$FILE" \
    custom_reward_function.name="$FUNCTION_NAME" \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    trainer.project_name='verl_example_gsm8k' \
    trainer.experiment_name='qwen_e2e_ci_custom_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.default_local_dir="$HOME/ckpt/" \
    trainer.total_training_steps=2 | tee "$OUTPUT_FILE"

python3 tests/e2e/check_custom_rwd_fn.py --output_file="$OUTPUT_FILE"
\ No newline at end of file
#!/usr/bin/env bash
# One-step run of recipe.dapo.src.main_dapo on the GSM8K parquet files with
# Qwen/Qwen2.5-0.5B. The tunables are collected in shell variables below and
# passed through as key=value overrides; arguments given to this script are
# appended verbatim after them.
#
# Quoting notes: all expansions and "$@" are quoted so values are never
# re-split, and the logger list is quoted so [...] is never glob-expanded.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

# Advantage estimation and KL settings.
adv_estimator=grpo

kl_coef=0.0
use_kl_in_reward=False
use_kl_loss=False
kl_loss_coef=0.0

# Lower / upper clip ratios.
clip_ratio_low=0.2
clip_ratio_high=0.28

# Sequence length limits and the overlong-buffer reward settings.
max_prompt_length=512
max_response_length=512
enable_overlong_buffer=True
overlong_buffer_len=128
overlong_penalty_factor=1.0

loss_agg_mode="token-mean"

# Group filtering.
enable_filter_groups=True
filter_groups_metric=seq_reward
max_num_gen_batches=10

# Batch sizes: the mini batch is half, and the generation batch three times,
# the training prompt batch.
train_prompt_bsz=32
train_prompt_mini_bsz=$((train_prompt_bsz / 2))
gen_prompt_bsz=$((train_prompt_bsz * 3))
n_resp_per_prompt=4

python3 -m recipe.dapo.src.main_dapo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    reward_model.reward_manager=dapo \
    algorithm.adv_estimator="${adv_estimator}" \
    algorithm.use_kl_in_reward="${use_kl_in_reward}" \
    algorithm.kl_ctrl.kl_coef="${kl_coef}" \
    actor_rollout_ref.actor.use_kl_loss="${use_kl_loss}" \
    actor_rollout_ref.actor.kl_loss_coef="${kl_loss_coef}" \
    actor_rollout_ref.actor.clip_ratio_low="${clip_ratio_low}" \
    actor_rollout_ref.actor.clip_ratio_high="${clip_ratio_high}" \
    data.max_prompt_length="${max_prompt_length}" \
    data.max_response_length="${max_response_length}" \
    reward_model.overlong_buffer.enable="${enable_overlong_buffer}" \
    reward_model.overlong_buffer.len="${overlong_buffer_len}" \
    reward_model.overlong_buffer.penalty_factor="${overlong_penalty_factor}" \
    actor_rollout_ref.actor.loss_agg_mode="${loss_agg_mode}" \
    data.train_batch_size="${train_prompt_bsz}" \
    data.gen_batch_size="${gen_prompt_bsz}" \
    algorithm.filter_groups.enable="${enable_filter_groups}" \
    algorithm.filter_groups.metric="${filter_groups_metric}" \
    algorithm.filter_groups.max_num_gen_batches="${max_num_gen_batches}" \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.rollout.n="${n_resp_per_prompt}" \
    actor_rollout_ref.actor.ppo_mini_batch_size="${train_prompt_mini_bsz}" \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    trainer.logger='[console]' \
    trainer.project_name='verl_example_gsm8k' \
    trainer.experiment_name='qwen2.5_0.5b_e2e_ci_dapo' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# One-step run of verl.trainer.main_ppo (actor + critic) on the GSM8K parquet
# files with Qwen/Qwen2.5-0.5B, saving a checkpoint every step under
# $HOME/<engine>/ckpt/.
#
# Arguments:
#   $1 - rollout engine name passed as actor_rollout_ref.rollout.name
#        (defaults to "vllm").
#
# Quoting notes: $ENGINE and $HOME expansions are quoted so they always stay a
# single word, and the logger list is quoted so [...] is never glob-expanded.
set -x

ENGINE=${1:-vllm}

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name="$ENGINE" \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=True \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    trainer.project_name='verl_example_gsm8k' \
    trainer.experiment_name='qwen_e2e_ci_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=1 \
    trainer.default_local_dir="$HOME/$ENGINE/ckpt/" \
    trainer.total_training_steps=1
# One-step run of verl.trainer.main_ppo (GAE, actor + critic) on the GSM8K
# parquet files with Qwen/Qwen2.5-0.5B, with both the actor KL loss
# (use_kl_loss=True) and the in-reward KL term (use_kl_in_reward=True) enabled.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: paths are quoted so a $HOME containing spaces stays one word,
# the logger list is quoted so the shell never treats [...] as a glob pattern,
# and "$@" keeps each forwarded argument intact.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=gae \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.actor.use_kl_loss=True \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=low_var_kl \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=True \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    algorithm.use_kl_in_reward=True \
    algorithm.kl_penalty=kl \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    trainer.project_name='verl_example_gsm8k' \
    trainer.experiment_name='qwen_e2e_ci_function_rm_both_kl' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.default_local_dir="$HOME/ckpt/" \
    trainer.total_training_steps=1 "$@"
# One-step GRPO run of verl.trainer.main_ppo on the GSM8K parquet files with
# Qwen/Qwen2.5-0.5B.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: paths are quoted so a $HOME containing spaces stays one word,
# the logger list is quoted so the shell never treats [...] as a glob pattern,
# and "$@" keeps each forwarded argument intact.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    algorithm.adv_estimator=grpo \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    trainer.project_name='verl_example_gsm8k' \
    trainer.experiment_name='qwen_e2e_ci_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# One-step run of verl.trainer.main_ppo (actor + critic) on the GSM8K parquet
# files with Qwen/Qwen2.5-0.5B, with use_remove_padding=False for both actor
# and critic and validation before training switched off.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: paths are quoted so a $HOME containing spaces stays one word,
# the logger list is quoted so the shell never treats [...] as a glob pattern,
# and "$@" keeps each forwarded argument intact.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=False \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=False \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    +trainer.val_before_train=False \
    trainer.project_name='verl_example_gsm8k' \
    trainer.experiment_name='qwen_e2e_ci_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# One-step run of verl.trainer.main_ppo with algorithm.adv_estimator=remax on
# the GSM8K parquet files with Qwen/Qwen2.5-0.5B.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: paths are quoted so a $HOME containing spaces stays one word,
# the logger list is quoted so the shell never treats [...] as a glob pattern,
# and "$@" keeps each forwarded argument intact.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.kl_ctrl.kl_coef=0.001 \
    algorithm.adv_estimator=remax \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    trainer.project_name='verl_example_gsm8k' \
    trainer.experiment_name='qwen_e2e_ci_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# One-step run of verl.trainer.main_ppo (actor + critic) on the GSM8K parquet
# files with Qwen/Qwen2.5-0.5B, with a model-based reward model enabled
# (reward_model.enable=True, same model path).
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: every continuation backslash is preceded by a space so that
# adjacent arguments can never be fused into one word; paths and "$@" are
# quoted; the logger list is quoted so [...] is never glob-expanded.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.return_raw_chat=True \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=True \
    critic.optim.lr_warmup_steps_ratio=0.05 \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    reward_model.enable=True \
    reward_model.model.path=Qwen/Qwen2.5-0.5B \
    reward_model.model.use_remove_padding=True \
    reward_model.model.fsdp_config.param_offload=True \
    reward_model.micro_batch_size_per_gpu=16 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    +trainer.val_before_train=False \
    trainer.project_name='verl_example' \
    trainer.experiment_name='Qwen2.5-0.5B-ci_hybrid_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# One-step run of verl.trainer.main_ppo (actor + critic + model-based reward
# model) on the GSM8K parquet files with Qwen/Qwen2.5-0.5B, adding
# actor_rollout_ref.model.use_liger=True and using the non-per-GPU
# micro-batch-size keys.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: every continuation backslash is preceded by a space so that
# adjacent arguments can never be fused into one word; paths and "$@" are
# quoted; the logger list is quoted so [...] is never glob-expanded.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.return_raw_chat=True \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    +actor_rollout_ref.model.use_liger=True \
    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size=32 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size=128 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size=128 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=True \
    critic.optim.lr_warmup_steps_ratio=0.05 \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size=32 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    reward_model.enable=True \
    reward_model.model.path=Qwen/Qwen2.5-0.5B \
    reward_model.model.use_remove_padding=True \
    reward_model.model.fsdp_config.param_offload=True \
    reward_model.micro_batch_size=16 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    +trainer.val_before_train=False \
    trainer.project_name='verl_example' \
    trainer.experiment_name='Qwen2.5-0.5B-ci_hybrid_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# One-step run of verl.trainer.main_ppo (actor + critic + model-based reward
# model) on the GSM8K parquet files with Qwen/Qwen2.5-0.5B, with
# use_remove_padding=False for actor, critic and reward model.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: every continuation backslash is preceded by a space so that
# adjacent arguments can never be fused into one word; paths and "$@" are
# quoted; the logger list is quoted so [...] is never glob-expanded.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.return_raw_chat=True \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=False \
    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=False \
    critic.optim.lr_warmup_steps_ratio=0.05 \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    reward_model.enable=True \
    reward_model.model.path=Qwen/Qwen2.5-0.5B \
    reward_model.model.use_remove_padding=False \
    reward_model.model.fsdp_config.param_offload=True \
    reward_model.micro_batch_size_per_gpu=16 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    +trainer.val_before_train=False \
    trainer.logger='[console]' \
    trainer.project_name='verl_example' \
    trainer.experiment_name='Qwen2.5-0.5B-ci_hybrid_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# One-step run of verl.trainer.main_ppo (actor + critic + model-based reward
# model) on the GSM8K parquet files with Qwen/Qwen2.5-0.5B, with
# use_dynamic_bsz=True and per-GPU token-length limits instead of fixed
# micro batch sizes.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: every continuation backslash is preceded by a space so that
# adjacent arguments can never be fused into one word; paths and "$@" are
# quoted; the logger list is quoted so [...] is never glob-expanded.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.return_raw_chat=True \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.use_dynamic_bsz=True \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=12000 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=12000 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=12000 \
    critic.optim.lr=1e-5 \
    critic.model.use_remove_padding=True \
    critic.optim.lr_warmup_steps_ratio=0.05 \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.use_dynamic_bsz=True \
    critic.ppo_max_token_len_per_gpu=98304 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    reward_model.enable=True \
    reward_model.model.path=Qwen/Qwen2.5-0.5B \
    reward_model.model.use_remove_padding=True \
    reward_model.model.fsdp_config.param_offload=True \
    reward_model.use_dynamic_bsz=True \
    reward_model.forward_max_token_len_per_gpu=98304 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    trainer.logger='[console]' \
    +trainer.val_before_train=False \
    trainer.project_name='verl_example' \
    trainer.experiment_name='Qwen2.5-0.5B-ci_hybrid_rm_seq_balance' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# One-step run of verl.trainer.main_ppo (actor + critic + model-based reward
# model) on the GSM8K parquet files with Qwen/Qwen2.5-0.5B, with
# ulysses_sequence_parallel_size=2 for actor, critic and reward model and
# fsdp_size=4 for actor and critic.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: every continuation backslash is preceded by a space so that
# adjacent arguments can never be fused into one word; paths and "$@" are
# quoted; the logger list is quoted so [...] is never glob-expanded.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS # vllm + qwen2 with flash_attn has some issues

python3 -m verl.trainer.main_ppo \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.return_raw_chat=True \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=1e-6 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.optim.lr_warmup_steps_ratio=0.1 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=2 \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.actor.fsdp_config.fsdp_size=4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    critic.optim.lr=1e-5 \
    critic.ulysses_sequence_parallel_size=2 \
    critic.model.use_remove_padding=True \
    critic.optim.lr_warmup_steps_ratio=0.05 \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.model.fsdp_config.param_offload=False \
    critic.model.fsdp_config.optimizer_offload=False \
    critic.model.fsdp_config.fsdp_size=4 \
    reward_model.enable=True \
    reward_model.ulysses_sequence_parallel_size=2 \
    reward_model.model.path=Qwen/Qwen2.5-0.5B \
    reward_model.model.use_remove_padding=True \
    reward_model.model.fsdp_config.param_offload=True \
    reward_model.micro_batch_size_per_gpu=16 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    +trainer.val_before_train=False \
    trainer.logger='[console]' \
    trainer.project_name='verl_example' \
    trainer.experiment_name='Qwen2.5-0.5B-ci_hybrid_rm_sp2' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# One-step run of recipe.prime.main_prime on the GSM8K parquet files with
# Qwen/Qwen2.5-0.5B as both policy and reward model, using
# algorithm.adv_estimator=rloo and accuracy-based filtering of the data.
# Arguments passed to this script are appended verbatim as extra overrides.
#
# Quoting notes: paths are quoted so a $HOME containing spaces stays one word,
# the logger list is quoted so the shell never treats [...] as a glob pattern,
# and "$@" keeps each forwarded argument intact.
set -x

export VLLM_ATTENTION_BACKEND=XFORMERS

python3 -m recipe.prime.main_prime \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=32 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    data.filter_accuracy=True \
    data.accuracy_lower_bound=0.2 \
    data.accuracy_upper_bound=0.8 \
    data.oversample_factor=4 \
    data.return_raw_chat=True \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=5e-7 \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=32 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.model.enable_gradient_checkpointing=False \
    actor_rollout_ref.actor.fsdp_config.param_offload=False \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.n=4 \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=1 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
    algorithm.adv_estimator=rloo \
    reward_model.model.path=Qwen/Qwen2.5-0.5B \
    reward_model.micro_batch_size_per_gpu=1 \
    reward_model.model.update=before \
    reward_model.model.beta_train=0.05 \
    reward_model.model.optim.lr=1e-6 \
    reward_model.model.optim.grad_clip=10.0 \
    reward_model.model.input_tokenizer=null \
    reward_model.mini_batch_size=32 \
    reward_model.reward_manager=naive \
    trainer.val_before_train=False \
    trainer.logger='[console]' \
    trainer.project_name='verl_example' \
    trainer.experiment_name='Qwen2.5-0.5B-PRIME' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.total_training_steps=1 "$@"
# Runs a 3-step Megatron PPO job (verl.trainer.main_ppo) on GSM8K with
# Qwen/Qwen2.5-0.5B, tensor-parallel size 2 for actor, ref and critic.
# Usage: bash <this script> [extra Hydra overrides...]
set -x
# the config file used: verl/trainer/main_ppo/config/ppo_megatron_trainer.yaml
huggingface-cli download Qwen/Qwen2.5-0.5B
export VLLM_ATTENTION_BACKEND=XFORMERS
# Every continuation backslash is preceded by a space: without it the
# --config-name value is only separated from the next override by whatever
# indentation the following line happens to carry.
# trainer.logger is quoted as a whole so [...] is never glob-expanded (the
# program still receives trainer.logger=[console]); "$@" forwards caller
# overrides one argument each.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
    critic.optim.lr=2e-5 \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.megatron.tensor_model_parallel_size=2 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console]' \
    trainer.project_name='verl_megatron_gsm8k_examples' \
    trainer.experiment_name='qwen2_5_0b5_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=3 "$@"
# Runs a 3-step Megatron PPO job (verl.trainer.main_ppo) on GSM8K with
# Qwen/Qwen2.5-0.5B using pipeline-parallel size 2, virtual pipeline size 2
# and tensor-parallel size 2 for actor, ref and critic.
# Usage: bash <this script> [extra Hydra overrides...]
set -x
# the config file used: verl/trainer/main_ppo/config/ppo_megatron_trainer.yaml
huggingface-cli download Qwen/Qwen2.5-0.5B
export VLLM_ATTENTION_BACKEND=XFORMERS
# Every continuation backslash is preceded by a space: without it the
# --config-name value is only separated from the next override by whatever
# indentation the following line happens to carry.
# trainer.logger is quoted as a whole so [...] is never glob-expanded (the
# program still receives trainer.logger=[console]); "$@" forwards caller
# overrides one argument each.
python3 -m verl.trainer.main_ppo --config-path=config \
    --config-name='ppo_megatron_trainer.yaml' \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.train_batch_size=1024 \
    data.max_prompt_length=512 \
    data.max_response_length=512 \
    actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B \
    actor_rollout_ref.actor.optim.lr=2e-6 \
    actor_rollout_ref.actor.ppo_mini_batch_size=256 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
    actor_rollout_ref.actor.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.actor.megatron.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
    actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
    actor_rollout_ref.rollout.name=vllm \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.megatron.pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.virtual_pipeline_model_parallel_size=2 \
    actor_rollout_ref.ref.megatron.tensor_model_parallel_size=2 \
    critic.optim.lr=2e-5 \
    critic.model.path=Qwen/Qwen2.5-0.5B \
    critic.model.enable_gradient_checkpointing=False \
    critic.ppo_micro_batch_size_per_gpu=4 \
    critic.megatron.pipeline_model_parallel_size=2 \
    critic.megatron.virtual_pipeline_model_parallel_size=2 \
    critic.megatron.tensor_model_parallel_size=2 \
    algorithm.kl_ctrl.kl_coef=0.001 \
    trainer.critic_warmup=0 \
    'trainer.logger=[console]' \
    trainer.project_name='verl_megatron_gsm8k_examples' \
    trainer.experiment_name='qwen2_5_0b5_function_rm' \
    trainer.n_gpus_per_node=8 \
    trainer.nnodes=1 \
    trainer.save_freq=-1 \
    trainer.test_freq=1 \
    trainer.total_epochs=15 \
    trainer.total_training_steps=3 "$@"
# Downloads DeepSeek-R1-Distill-Qwen-1.5B, generates one sample per prompt
# for $HOME/data/r1/test.parquet with verl.trainer.main_generation, then
# scores the generated parquet with recipe.r1.main_eval.
# All $HOME-based paths are quoted so a home directory containing whitespace
# is passed as a single argument.
set -x
huggingface-cli download deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B \
    --local-dir "$HOME/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
# Generation step: writes its output to test-output-k1.parquet.
python3 -m verl.trainer.main_generation \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node=8 \
    data.path="$HOME/data/r1/test.parquet" \
    data.prompt_key=prompt \
    data.batch_size=1024 \
    data.n_samples=1 \
    data.output_path="$HOME/data/r1/test-output-k1.parquet" \
    model.path="$HOME/models/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" \
    rollout.temperature=0.6 \
    rollout.top_p=0.95 \
    rollout.prompt_length=1024 \
    rollout.response_length=32768 \
    rollout.tensor_model_parallel_size=1 \
    rollout.gpu_memory_utilization=0.95 \
    rollout.max_num_batched_tokens=65536 \
    rollout.enforce_eager=False \
    rollout.free_cache_engine=False
# Evaluation step: reads the file produced above.
python3 -m recipe.r1.main_eval \
    data.path="$HOME/data/r1/test-output-k1.parquet" \
    data.prompt_key=prompt \
    data.response_key=responses \
    custom_reward_function.path=recipe/r1/reward_score.py \
    custom_reward_function.name=reward_func
\ No newline at end of file
#!/usr/bin/env bash
# End-to-end check on the arithmetic-sequence toy task: runs the RL trainer
# with the HF rollout backend, captures its console output, and hands that
# output to tests/e2e/check_results.py.
set -e -x
# Use a unique temp file instead of a fixed /tmp name so concurrent runs
# cannot clobber each other's output, and remove it on every exit path
# (with `set -e` a failing step would otherwise leave the file behind).
OUTPUT_FILE=$(mktemp /tmp/output_ray_trainer.XXXXXX)
trap 'rm -rf -- "$OUTPUT_FILE"' EXIT
export PATH=$PATH:~/.local/bin
# NOTE(review): pipefail is not enabled, so the trainer's own exit status is
# masked by tee; check_results.py below is the effective pass/fail gate.
# trainer.logger is quoted as a whole so [...] is never glob-expanded; the
# program still receives trainer.logger=[console].
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
    data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
    data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
    data.train_batch_size=800 \
    data.max_prompt_length=16 \
    data.max_response_length=32 \
    data.return_raw_input_ids=True \
    actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
    actor_rollout_ref.model.external_lib=tests.e2e.envs.digit_completion \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=200 \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.optim.lr=1e-4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=200 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=200 \
    actor_rollout_ref.rollout.name=hf \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    critic.ppo_micro_batch_size_per_gpu=200 \
    critic.model.path=tests/e2e/arithmetic_sequence/model \
    critic.optim.lr=1e-3 \
    algorithm.kl_ctrl.kl_coef=0.005 \
    trainer.total_epochs=200 \
    trainer.experiment_name=arithmetic_sequences \
    'trainer.logger=[console]' \
    trainer.n_gpus_per_node=1 \
    trainer.test_freq=1 \
    trainer.save_freq=110 | tee "$OUTPUT_FILE"
python3 tests/e2e/check_results.py --output_file="$OUTPUT_FILE"
#!/usr/bin/env bash
# End-to-end check on the arithmetic-sequence toy task with
# rollout.use_fire_sampling=True: runs the RL trainer with the HF rollout
# backend, captures its console output, and hands that output to
# tests/e2e/check_results.py.
set -e -x
# Use a unique temp file instead of a fixed /tmp name so concurrent runs
# cannot clobber each other's output, and remove it on every exit path
# (with `set -e` a failing step would otherwise leave the file behind).
OUTPUT_FILE=$(mktemp /tmp/output_ray_trainer.XXXXXX)
trap 'rm -rf -- "$OUTPUT_FILE"' EXIT
export PATH=$PATH:~/.local/bin
# NOTE(review): pipefail is not enabled, so the trainer's own exit status is
# masked by tee; check_results.py below is the effective pass/fail gate.
# trainer.logger is quoted as a whole so [...] is never glob-expanded; the
# program still receives trainer.logger=[console].
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py \
    data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet \
    data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet \
    data.train_batch_size=800 \
    data.val_batch_size=200 \
    data.max_prompt_length=16 \
    data.max_response_length=32 \
    data.return_raw_input_ids=True \
    actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model \
    actor_rollout_ref.model.external_lib=tests.e2e.envs.digit_completion \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=200 \
    actor_rollout_ref.actor.entropy_coeff=0 \
    actor_rollout_ref.actor.optim.lr=1e-4 \
    actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=200 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=200 \
    actor_rollout_ref.rollout.name=hf \
    actor_rollout_ref.rollout.use_fire_sampling=True \
    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
    critic.ppo_micro_batch_size_per_gpu=200 \
    critic.model.path=tests/e2e/arithmetic_sequence/model \
    critic.optim.lr=1e-3 \
    algorithm.kl_ctrl.kl_coef=0.005 \
    trainer.total_epochs=200 \
    trainer.experiment_name=arithmetic_sequences \
    'trainer.logger=[console]' \
    trainer.n_gpus_per_node=1 \
    trainer.test_freq=1 \
    trainer.save_freq=110 | tee "$OUTPUT_FILE"
python3 tests/e2e/check_results.py --output_file="$OUTPUT_FILE"
#!/usr/bin/env bash
# One-epoch end-to-end run of the arithmetic-sequence trainer with the vLLM
# rollout backend and a Qwen2.5-0.5B critic that has use_remove_padding on.
set -e -x
# Hydra overrides, grouped by the component they configure.
trainer_overrides=(
    # dataset
    data.train_files=tests/e2e/arithmetic_sequence/data/train.parquet
    data.val_files=tests/e2e/arithmetic_sequence/data/test.parquet
    # actor / rollout
    actor_rollout_ref.model.path=tests/e2e/arithmetic_sequence/model
    actor_rollout_ref.rollout.name=vllm
    actor_rollout_ref.rollout.tensor_model_parallel_size=1
    actor_rollout_ref.model.tokenizer_path=tests/e2e/arithmetic_sequence/model
    # critic
    critic.model.path=Qwen/Qwen2.5-0.5B
    critic.model.use_remove_padding=True
    # schedule
    trainer.total_epochs=1
)
python3 tests/e2e/arithmetic_sequence/rl/main_trainer.py "${trainer_overrides[@]}"
\ No newline at end of file
# Tested with 1 & 4 GPUs
# Generates one response per prompt of $HOME/data/gsm8k/test.parquet with
# Qwen/Qwen2.5-0.5B-Instruct via verl.trainer.main_generation.
# Arguments:
#   $1 - number of GPUs on the (single) node
#   $2 - output parquet path
#   $3 - optional rollout tensor-parallel size (default 2)
set -x
if [ "$#" -lt 2 ]; then
    echo "Usage: run_gen_qwen05.sh <nproc_per_node> <save_path> [other_configs...]"
    exit 1
fi
nproc_per_node=$1
save_path=$2
infer_tp=${3:-2} # Default tensor parallel size to 2
# Drop the two mandatory arguments.
# NOTE(review): after this shift "$@" still begins with the infer_tp value,
# and nothing in "$@" is forwarded to the command below, so the
# [other_configs...] advertised in the usage text currently have no effect.
shift 2
# Caller-supplied values are quoted so paths with whitespace stay one word.
python3 -m verl.trainer.main_generation \
    trainer.nnodes=1 \
    trainer.n_gpus_per_node="$nproc_per_node" \
    data.path="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=prompt \
    data.n_samples=1 \
    data.output_path="$save_path" \
    model.path=Qwen/Qwen2.5-0.5B-Instruct \
    +model.trust_remote_code=True \
    rollout.temperature=1.0 \
    rollout.top_k=50 \
    rollout.top_p=0.7 \
    rollout.prompt_length=2048 \
    rollout.response_length=1024 \
    rollout.tensor_model_parallel_size="$infer_tp" \
    rollout.gpu_memory_utilization=0.8
#!/bin/bash
# Cancels every queued GitHub Actions workflow run of volcengine/verl.
# Usage: <this script> YOUR_GITHUB_TOKEN
# Exit status: 0 when nothing is queued or the cancel loop completed,
#              1 on bad usage, missing jq, or an unusable API response.
if [ "$#" -ne 1 ]; then
    echo "Usage: $0 YOUR_GITHUB_TOKEN"
    echo "Please provide exactly one input argument for your github token."
    exit 1
fi
# Set your GitHub repository details
OWNER="volcengine"
REPO="verl"
TOKEN=$1
# API URL for workflow runs
API_URL="https://api.github.com/repos/$OWNER/$REPO/actions/runs?status=queued"
# Check required commands
command -v jq >/dev/null 2>&1 || { echo "jq is required but not installed. Aborting."; exit 1; }
# Get queued workflow runs; a transport failure (DNS, TLS, ...) is reported
# instead of being treated as an empty answer.
if ! response=$(curl -s -H "Authorization: token $TOKEN" -H "Accept: application/vnd.github.v3+json" "$API_URL"); then
    echo "Failed to query $API_URL" >&2
    exit 1
fi
# Run this for debugging
# echo $response
# Extract run IDs. jq exits non-zero when the reply has no workflow_runs
# array (for example an error object returned for bad credentials); that
# case must not be reported as "No queued workflow runs found.".
if ! queued_run_ids=$(printf '%s\n' "$response" | jq -r '.workflow_runs[] | .id'); then
    echo "Unexpected response from the GitHub API:" >&2
    printf '%s\n' "$response" >&2
    exit 1
fi
if [ -z "$queued_run_ids" ]; then
    echo "No queued workflow runs found."
    exit 0
fi
# Cancel each queued run. $queued_run_ids is intentionally unquoted: it is a
# newline-separated list of numeric IDs that must be split into words.
for run_id in $queued_run_ids; do
    echo "Cancelling run $run_id"
    cancel_url="https://api.github.com/repos/$OWNER/$REPO/actions/runs/$run_id/cancel"
    curl -s -X POST -H "Authorization: token $TOKEN" -H "Accept: application/vnd.github.v3+json" "$cancel_url" \
        || echo "Failed to send cancel request for run $run_id" >&2
done
echo "Cancelled all queued workflow runs."
#!/bin/bash
# Starts a local Ray head on port 6379, runs server.py and then client.py,
# and always stops Ray afterwards.
# The script exits with the status of the first failing Python step; before,
# the final `ray stop` decided the exit status and hid such failures.
ray start --head --port=6379
status=0
python3 server.py || status=$?
# client.py is still run after a server failure, as before; only the first
# non-zero status is kept.
python3 client.py || { [ "$status" -ne 0 ] || status=$?; }
ray stop --force
exit "$status"
\ No newline at end of file
# Tested with 2 & 4 GPUs
# One-step SFT smoke run (verl.trainer.fsdp_sft_trainer) of
# Qwen/Qwen2.5-0.5B-Instruct on GSM8K, launched with torchrun on 8 processes.
# Checkpoints go to $HOME/ckpts/ and are deleted at the end.
# Usage: bash <this script> [extra Hydra overrides...]
set -x
# The list-valued overrides are quoted as a whole so the shell never treats
# [...] as a glob; the trainer still receives e.g. trainer.logger=[console].
# "$@" forwards each caller-supplied override as exactly one argument.
torchrun --standalone --nnodes=1 --nproc_per_node=8 \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    '+data.prompt_dict_keys=[question]' \
    '+data.response_dict_keys=[answer]' \
    data.micro_batch_size_per_gpu=32 \
    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
    trainer.default_local_dir="$HOME/ckpts/" \
    trainer.project_name=qwen2.5-sft \
    trainer.experiment_name=gsm8k-sft-gemma-2b-it \
    trainer.total_training_steps=1 \
    'trainer.logger=[console]' \
    trainer.default_hdfs_dir=null "$@"
# ${HOME:?} aborts instead of running `rm -rf /ckpts/` when HOME is unset or
# empty.
rm -rf -- "${HOME:?}/ckpts/"
\ No newline at end of file
# Tested with 2 & 4 GPUs
# One-step LoRA SFT smoke run (verl.trainer.fsdp_sft_trainer) of
# Qwen/Qwen2.5-0.5B-Instruct on GSM8K.
# Arguments:
#   $1   - processes per node for torchrun
#   $2   - directory for local checkpoints
#   $3.. - extra Hydra overrides, inserted before the LoRA settings
set -x
if [ "$#" -lt 2 ]; then
    echo "Usage: run_sft_qwen05_peft.sh <nproc_per_node> <save_path> [other_configs...]"
    exit 1
fi
nproc_per_node=$1
save_path=$2
# Shift the arguments so $@ refers to the rest
shift 2
# Every continuation backslash is preceded by a space: `model.lora_rank=32\`
# was only separated from model.lora_alpha by the indentation of the next
# line. List-valued overrides are quoted as a whole so [...] is never
# glob-expanded, and "$@" keeps each caller override as one argument.
torchrun --standalone --nnodes=1 --nproc_per_node="$nproc_per_node" \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    optim.lr=1e-4 \
    '+data.prompt_dict_keys=[question]' \
    '+data.response_dict_keys=[answer]' \
    data.micro_batch_size_per_gpu=4 \
    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
    trainer.default_local_dir="$save_path" \
    trainer.project_name=gsm8k-sft \
    trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct \
    'trainer.logger=[console]' \
    trainer.total_training_steps=1 \
    trainer.default_hdfs_dir=null "$@" \
    model.lora_rank=32 \
    model.lora_alpha=16 \
    model.target_modules=all-linear
# Or you can do this:
# model.target_modules=[q_proj,v_proj] \
# One-step SFT smoke run (verl.trainer.fsdp_sft_trainer) of
# Qwen/Qwen2.5-0.5B-Instruct on GSM8K with model.use_liger=True,
# ulysses_sequence_parallel_size=2 and use_remove_padding=true.
# Arguments:
#   $1   - processes per node for torchrun
#   $2   - directory for local checkpoints
#   $3.. - extra Hydra overrides, inserted before the sequence-parallel settings
set -x
if [ "$#" -lt 2 ]; then
    echo "Usage: run_sft_qwen05_sp2_liger.sh <nproc_per_node> <save_path> [other_configs...]"
    exit 1
fi
nproc_per_node=$1
save_path=$2
# Shift the arguments so $@ refers to the rest
shift 2
# Caller-supplied values and $HOME paths are quoted so whitespace cannot
# split them; list-valued overrides are quoted as a whole so [...] is never
# glob-expanded; "$@" keeps each caller override as one argument.
torchrun --standalone --nnodes=1 --nproc_per_node="$nproc_per_node" \
    -m verl.trainer.fsdp_sft_trainer \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    optim.lr=1e-4 \
    '+data.prompt_dict_keys=[question]' \
    '+data.response_dict_keys=[answer]' \
    data.micro_batch_size=4 \
    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
    model.use_liger=True \
    trainer.default_local_dir="$save_path" \
    trainer.project_name=gsm8k-sft \
    trainer.experiment_name=gsm8k-sft-qwen-2.5-0.5b-instruct-sp2-liger \
    'trainer.logger=[console]' \
    trainer.total_training_steps=1 \
    trainer.default_hdfs_dir=null "$@" \
    ulysses_sequence_parallel_size=2 \
    use_remove_padding=true
\ No newline at end of file
# Tested with 2 & 4 GPUs
# Runs tests/sft/test_sp_loss_match.py under torchrun (8 processes) with
# ulysses_sequence_parallel_size=2 and use_remove_padding=True on GSM8K.
# Checkpoints go to $HOME/ckpts/ and are deleted at the end.
# Usage: bash <this script> [extra Hydra overrides...]
set -x
# The list-valued overrides are quoted as a whole so the shell never treats
# [...] as a glob; the program still receives e.g. trainer.logger=[console].
# "$@" forwards each caller-supplied override as exactly one argument.
torchrun --standalone --nnodes=1 --nproc_per_node=8 \
    tests/sft/test_sp_loss_match.py \
    data.train_files="$HOME/data/gsm8k/train.parquet" \
    data.val_files="$HOME/data/gsm8k/test.parquet" \
    data.prompt_key=extra_info \
    data.response_key=extra_info \
    '+data.prompt_dict_keys=[question]' \
    '+data.response_dict_keys=[answer]' \
    data.micro_batch_size=32 \
    model.partial_pretrain=Qwen/Qwen2.5-0.5B-Instruct \
    ulysses_sequence_parallel_size=2 \
    use_remove_padding=True \
    trainer.default_local_dir="$HOME/ckpts/" \
    trainer.project_name=qwen2.5-sft \
    trainer.experiment_name=gsm8k-sft-gemma-2b-it \
    trainer.total_training_steps=1 \
    'trainer.logger=[console]' \
    trainer.default_hdfs_dir=null "$@"
# ${HOME:?} aborts instead of running `rm -rf /ckpts/` when HOME is unset or
# empty.
rm -rf -- "${HOME:?}/ckpts/"
#!/bin/bash
# Node-level preamble: points NCCL at the InfiniBand devices that are up and
# prints device / SLURM task information to the job log.
# Extract device names and merge them into a comma-separated string
THIS_UP_IB_DEV=$(ibdev2netdev | grep Up | grep ib | awk '{print $1}' | paste -sd ',' -)
export NCCL_IB_HCA=$THIS_UP_IB_DEV
#- Log information
node_dev_msg="
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Task run on: $(hostname -s);
GPU devices: $(nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit);
InfiniBand devices: $(ibdev2netdev);
NCCL_IB_HCA=$THIS_UP_IB_DEV;
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
"
# GPUs per task is computed up front and guarded: with SLURM_NTASKS missing
# or zero the inline arithmetic would fail and the whole banner assignment
# would be lost.
if [ "${SLURM_NTASKS:-0}" -gt 0 ] 2>/dev/null; then
    gpus_per_task=$(( ${USER_NGPUS:-0} / SLURM_NTASKS ))
else
    gpus_per_task="n/a"
fi
node_task_msg="
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Task run on: $(hostname -s), PID: ${SLURM_TASK_PID},
USE GPU ${CUDA_VISIBLE_DEVICES} of this node (GPUs_PER_Node, not PER_Task);
GlobalID : $SLURM_PROCID of $SLURM_NTASKS,
NodeID : $SLURM_NODEID of $SLURM_JOB_NUM_NODES,
LocalID : $SLURM_LOCALID of $SLURM_NTASKS_PER_NODE;
GPUs_PER_Task = $USER_NGPUS / $SLURM_NTASKS = $gpus_per_task,
MASTER_ADDR = $MASTER_ADDR
MASTER_PORT = $MASTER_PORT
WORLD_SIZE = $WORLD_SIZE
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
"
# Quoted so the banners keep their line breaks and no part of the tool
# output (for example a bracketed unit in a CSV header) is glob-expanded.
echo "$node_dev_msg"
echo "$node_task_msg"
# Raises per-process resource limits, loads the compiler / CUDA environment
# modules, logs which tools are on PATH, and schedules a delayed GPU-usage
# report in the background.
#- Important setting!!!
## otherwise it will cause an error of insufficient RDMA resources:
ulimit -l unlimited
## otherwise it will result in an insufficient virtual memory size error, especially when loading LLM:
ulimit -v unlimited
# -n: maximum number of open file descriptors; -u: maximum number of user processes
ulimit -n 65535
ulimit -u 4125556
#- Load environments
source /tools/module_env.sh
# source ~/.bashrc
##- language
# module load python3/3.8.16
module load gcc/9.3.0
##- CUDA
# module load cuda-cudnn/11.6-8.4.1
# Unload whatever cuda-cudnn module is active before loading 11.8-8.8.1.
module unload cuda-cudnn
module load cuda-cudnn/11.8-8.8.1
export CUDA_HOME=/tools/cluster-software/cuda-cudnn/cuda-11.8.0-8.8.1
which nvcc
echo $CUDA_HOME
# The command substitutions below are unquoted, so multi-line tool output is
# joined onto the single "Task N:" log line.
echo "Task $SLURM_PROCID: "$(module list) # list modules loaded
echo "Task $SLURM_PROCID: "$(which gcc)
echo "Task $SLURM_PROCID: "$(which python)
echo "Task $SLURM_PROCID: "$(which python3)
#- WARNING! DO NOT MODIFY your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Node $SLURM_NODEID, LocalID $SLURM_LOCALID: Use GPU ${CUDA_VISIBLE_DEVICES}"
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
##- Monitor
# The script continues executing other tasks while the following command will execute after a while
module load slurm-tools/v1.0
# Background subshell: sleeps three hours, then runs the stats tool once.
(sleep 3h && slurm-gpu-atop-log-stats $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES) &
echo "Main program continues to run. Monitoring information will be exported after three hours."
#- Main program execution
##- virtualenv
# source xxxxx/activate
# source "/workspace/S/zhuyaoyu/softwares/miniconda3/etc/profile.d/conda.sh"
source activate /workspace/S/zhuyaoyu/softwares/miniconda3/envs/dapo/
export PATH="/workspace/S/zhuyaoyu/softwares/miniconda3/envs/dapo/:$PATH"
which python
##- Weights & Biases
# The API key must come from the submitting environment (export
# WANDB_API_KEY before launching the job); it is never stored in this
# script. A key that was previously committed here should be treated as
# leaked and revoked.
if [ -n "${WANDB_API_KEY:-}" ]; then
    export WANDB_API_KEY
    wandb login "$WANDB_API_KEY"
else
    echo "WANDB_API_KEY is not set; skipping 'wandb login'." >&2
fi
# Runs are recorded offline regardless of whether a login happened.
export WANDB_MODE=offline
##- Job step TODO
# Exports the debugging / runtime switches used by the rest of the job and
# records the Ray temp directory and the current working directory.
# ray's default GCS(Global Control Store) port is 6379
# and default dashboard port is 8265
# need to set `"working_dir": "."` in --runtime-env-json, otherwise working_dir will set to ~(/home/S/your_name) by default
# NCCL / torch.distributed
export \
    TORCH_NCCL_ASYNC_ERROR_HANDLING=1 \
    NCCL_DEBUG=INFO \
    NCCL_TIMEOUT=120
# Ray
export RAY_record_ref_creation_sites=1 RAY_IGNORE_UNHANDLED_ERRORS=0
# Hydra / Python / vLLM
export HYDRA_FULL_ERROR=1 PYTHONUNBUFFERED=TRUE VLLM_USE_V1=1
# Ray temp directory, named after the SLURM job id
RAY_TEMP_DIR="/tmp/ray_${SLURM_JOBID}"
export RAY_TEMP_DIR
printf 'RAY TEMP DIR is %s\n' "$RAY_TEMP_DIR"
# Absolute path of the directory the job was started from
CURR_DIR=$(realpath .)
export CURR_DIR
USER=$(whoami)
# export TORCH_EXTENSIONS_DIR="/workspace/S/$USER/.cache/torch_extensions"
# export HF_HOME="/workspace/S/$USER/.cache"
# export PIP_TOOLS_CACHE_DIR="/workspace/S/$USER/.cache/pip-tools"
# export TRITON_CACHE_DIR="/workspace/S/$USER/.triton/autotune"
# Ray cluster bring-up: task 0 starts the head, every other task joins it,
# and all tasks wait until `ray status` lists one node per SLURM task.
echo "USER GPUS PER NODE IS $USER_GPUS_PER_NODE"
ray stop --force
# NOTE(review): MASTER_IP is assigned but not referenced in the visible script.
MASTER_IP=$(nslookup $MASTER_ADDR | awk '/^Address: / { print $2 }')
# All auxiliary Ray ports are fixed offsets below MASTER_PORT.
DASHBOARD_PORT=$(( MASTER_PORT - 10000 ))
DAL_PORT=$(( MASTER_PORT - 20000 ))
RCS_PORT=$(( MASTER_PORT - 30000 ))
RS_PORT=$(( MASTER_PORT - 5000 ))
NM_PORT=$(( MASTER_PORT - 15000 ))
OM_PORT=$(( MASTER_PORT - 25000 ))
if [ "$SLURM_PROCID" -eq 0 ]; then
    # launch the master node of ray in container
    # https://github.com/OpenRLHF/OpenRLHF/issues/339
    ray start --head \
        --node-ip-address $MASTER_ADDR \
        --port $MASTER_PORT \
        --redis-shard-ports $RS_PORT \
        --node-manager-port $NM_PORT \
        --object-manager-port $OM_PORT \
        --dashboard-port $DASHBOARD_PORT \
        --dashboard-agent-listen-port $DAL_PORT \
        --ray-client-server-port $RCS_PORT \
        --num-gpus $USER_GPUS_PER_NODE \
        --temp-dir=$RAY_TEMP_DIR
fi
# sleep 99999
export RAY_START_TIMEOUT=180
# wait for master node: poll the head's port until it accepts connections
if ! timeout $RAY_START_TIMEOUT bash -c "while ! nc -z $MASTER_ADDR ${MASTER_PORT}; do sleep 2; done"; then
    echo "Ray start on master node time out!!!"
    ray stop --force
    exit 1
fi
if [ "$SLURM_PROCID" -ne 0 ]; then
    # # if you want to launch ray on more nodes, use
    ray start --address $MASTER_ADDR:$MASTER_PORT \
        --node-manager-port $NM_PORT \
        --object-manager-port $OM_PORT \
        --dashboard-agent-listen-port $DAL_PORT \
        --num-gpus $USER_GPUS_PER_NODE \
        --temp-dir=$RAY_TEMP_DIR
fi
# wait for other nodes: the escaped \$ defers expansion to the inner shell
if ! timeout $RAY_START_TIMEOUT bash -c "while [ \$(ray status | grep -c 'node_') -lt \$SLURM_NTASKS ]; do sleep 2; done"; then
    echo "Timeout waiting for worker nodes!"
    ray stop --force
    exit 1
fi
echo "All worker nodes are ready!"
ray status
# Task 0 submits the training script as a Ray job, mirrors the SLURM logs
# and plots into SAVE_DIR while it runs, then archives the Ray session
# directory. Every other task just waits until the head node goes away.
# Paths built from CURR_DIR / SAVE_DIR / RAY_TEMP_DIR are quoted so
# directories containing whitespace or glob characters are handled as one
# argument.
# only need to submit job on the master node,
# and submitting on other nodes will cause network errors
if [ "$SLURM_PROCID" -eq 0 ]; then
    ray list nodes
    SCRIPT_TO_RUN="$CURR_DIR/recipe/dapo/run_dapo_codev_7b_16k_err_l1.0_continuous_reward.sh"
    export SAVE_DIR="$CURR_DIR/results/run_dapo_codev_7b_16k_continuous_reward_0.0"
    # SCRIPT_TO_RUN=recipe/dapo/run_dapo_codev_7b_20k_err_l0.2_r1_continuous_reward.sh
    # export SAVE_DIR="$CURR_DIR/results/run_dapo_codev_7b_20k_continuous_reward"
    # SCRIPT_TO_RUN=recipe/dapo/dapo_7b_test.sh
    # export SAVE_DIR="$CURR_DIR/results/dapo_7b_test"
    mkdir -p "$SAVE_DIR"
    chmod 777 "$SAVE_DIR"
    cp "$SCRIPT_TO_RUN" "$SAVE_DIR"
    # Background loop: after an initial 30 minutes, copy this job's SLURM
    # log files into SAVE_DIR, regenerate the plots, and open up permissions.
    copy_log_and_plot() {
        sleep 30m
        while true; do
            # The trailing .* stays outside the quotes so it still globs.
            cp "$CURR_DIR/ret_one/$SLURM_JOBID".* "$SAVE_DIR" && python "$CURR_DIR/plot_and_analyze/plot.py" --folder "$SAVE_DIR"
            find "$SAVE_DIR" \( -type d -o -type f \) -exec chmod 777 {} +
            sleep 3m # runs every 3 minutes; adjust the interval as needed
        done
    }
    copy_log_and_plot &
    COPY_PID=$!
    # Runtime environment for the Ray job, built with jq so values are
    # JSON-escaped.
    RUNTIME_ENV=$(jq -n --arg save_dir "$SAVE_DIR" --arg path "$PATH" '{
        "pip": ["ray"],
        "working_dir": ".",
        "excludes": ["ckpt/", "xxx/", "ret_one/", "data/", "results/", ".git/"],
        "disable_caching": true,
        "env_vars": {"SAVE_DIR": $save_dir, "WANDB_DIR":$save_dir, "PATH":$path}
    }')
    ray job submit --address="http://127.0.0.1:$DASHBOARD_PORT" --runtime-env-json="$RUNTIME_ENV" -- bash "$SCRIPT_TO_RUN"
    kill "$COPY_PID"
    # Final copy + plot once the job has finished.
    cp "$CURR_DIR/ret_one/$SLURM_JOBID".* "$SAVE_DIR" && python "$CURR_DIR/plot_and_analyze/plot.py" --folder "$SAVE_DIR"
    # sleep 48h
    mkdir -p "../tmp/ray_$USER"
    chmod 777 ../tmp
    cp -rfL "$RAY_TEMP_DIR/session_latest" "../tmp/ray_$USER/"
    ray stop --force
else
    # echo "Worker node $SLURM_PROCID is waiting for head node to finish"
    # Function to check connection to master node: succeeds as soon as the
    # head's port accepts a connection, fails after 60 seconds without one.
    check_connection() {
        timeout 60 bash -c "while ! nc -z $MASTER_ADDR ${MASTER_PORT}; do sleep 5; done"
    }
    while true; do
        if ! check_connection; then
            echo "Connection to master node lost. Exiting worker node."
            break
        fi
        sleep 60 # Check every 60 seconds
    done
    ray stop --force
fi
#- End
slurm-gpu-atop-log-stats $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
# This will overwrite any existing atop logs from previous runs.
# WARNING: If your program times out or is terminated by scancel,
# the above script part might not execute correctly.
#!/bin/bash
# SLURM batch script: requests 2 exclusive nodes with 8 GPUs each and runs
# one task per node.
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J train-rl-%j # The job name
#SBATCH -o ret_one/%j.out # Write the standard output to 'ret_one/<job_number>.out'
#SBATCH -e ret_one/%j.err # Write the standard error to 'ret_one/<job_number>.err'
#- Resources
# (TODO)
# Please modify your requirements
#SBATCH --mem=0
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1 # Request P tasks per node
###SBATCH --cpus-per-task=48 # Request Q core per task; means that P*Q cores per node
# SBATCH
#SBATCH -p r8nv-gpu-dedicated # Submit to the 'r8nv-gpu-dedicated' partition
#SBATCH -t 7-00:00:00 # Run for a maximum time of 7 days, 00 hours, 00 mins, 00 secs
#SBATCH --nodes=2 # Request N nodes
#SBATCH --gres=gpu:8 # Request M GPU per node
#SBATCH --qos=normal # Request QOS Type
#SBATCH --constraint=A100_80G
#SBATCH --exclude=r8a100-d[07]
#==========================================================================#
# Please add the SLURM parameter configuration above this horizontal line, #
# and read the README and F.A.Q. at the end of this document. #
#==========================================================================#
# Job body: derives GPU/task counts and the rendezvous address from the
# SLURM environment, validates them, exports the NCCL settings, logs the
# configuration, and starts train-multigpu.sh once per task via srun.
export USER_GPUS_PER_NODE=8 # <--------------------- Modify it in time!
export USER_NGPUS=$(($USER_GPUS_PER_NODE*$SLURM_JOB_NUM_NODES))
# "host1:<gpus>,host2:<gpus>,..." built from the expanded node list.
# $SLURM_JOB_NODELIST is quoted: a compressed list such as name[01-02] is a
# valid glob pattern and must reach scontrol unchanged.
nodelist_h_format=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | \
    awk -v gpu="$USER_GPUS_PER_NODE" '{printf ((NR>1?",":"")$0":%s"), gpu}')
#- Check
# NOTE: `exit -1` is reported by bash as exit status 255.
if [[ -z "$SLURM_NTASKS" ]]; then
    echo "SLURM_NTASKS is empty, please check your SBATCH parameter."
    exit -1
fi
if [[ -z "$SLURM_NTASKS_PER_NODE" ]]; then
    echo "SLURM_NTASKS_PER_NODE is empty, please check your SBATCH parameter."
    exit -1
fi
task_size=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
if [[ "$task_size" != "$SLURM_NTASKS" ]]; then
    echo "NTASKS_PER_NODE * NNODE != NNTASK, please check your SBATCH parameter."
    exit -1
fi
if [[ "$task_size" != "$USER_NGPUS" ]]; then
    echo "INFO..."
    echo "That's a total of $SLURM_NTASKS tasks, requiring a total of $USER_NGPUS GPUs"
    echo "Becareful whether your program requires \$SLURM_NTASKS or NGPUS"
fi
#- Global Info
# The first host of the allocation acts as the rendezvous master.
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
# Port = 50000 + last four characters of the job id. expr is kept on
# purpose: it reads a suffix such as 0812 as decimal, whereas $(( )) would
# treat the leading 0 as an octal prefix.
export MASTER_PORT=$(expr 50000 + $(echo -n "$SLURM_JOBID" | tail -c 4))
### (TODO)
### Warning:
### Sometimes WORLD_SIZE is not the number of tasks, it may be the number of GPUs,
### or even other values which need to be decided according to the actual situation
###
### export WORLD_SIZE=${USER_NGPUS}
### or
### export WORLD_SIZE=${task_size}
###
export WORLD_SIZE=${USER_NGPUS}
#- NCCL Setting
###
### IB here refers to RDMA, not the InfiniBand network in the narrow sense,
### it consists of RDMA over IB network, or RDMA over Converged Ethernet
###
### RDMA's advantages: Zero-Copy and Kernel Bypass, make it faster than TCP stack
### Since the cluster is basically configured with IB NICs, the best performance is obtained when using RDMA
###
### The NCCL_DEBUG variable controls the debug information that is displayed from NCCL
### INFO - Prints debug information
### export NCCL_DEBUG="INFO"
###
export NCCL_IB_DISABLE=0 # 0: Using RDMA, 1: Using TCP/IP
export NCCL_P2P_DISABLE=0 # 0: Using P2P, 1: Not P2P, using cpu forwarding (high latency)
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_NET_GDR_LEVEL=2
export NCCL_IB_HCA="mlx5_0,mlx5_1,mlx5_2,mlx5_3"
#- Log information
# Values are quoted so bracketed node lists are printed literally instead of
# being glob-expanded against the working directory.
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Nodelist:= " "$SLURM_JOB_NODELIST"
echo "Nodelistname:= " "$nodelist_h_format"
echo "Number of nodes:= " "$SLURM_JOB_NUM_NODES"
echo "Ntasks per node:= " "$SLURM_NTASKS_PER_NODE"
echo "Ntasks of jobs:= " "$SLURM_NTASKS"
echo "NGPUs of jobs:= " "$USER_NGPUS"
echo "MASTER_ADDR:= " "$MASTER_ADDR"
echo "MASTER_PORT:= " "$MASTER_PORT"
echo "WORLD_SIZE:= " "$WORLD_SIZE"
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "The job is triggered on node:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded
#- Tools
module load cluster-tools/v1.0
module load slurm-tools/v1.0
echo "$(df -h | grep -v tmpfs)"
cluster-quota # nas quota
#- Job step
# (TODO) Be sure to modify the template.multi-gpus-task.sh file as well.
echo "slurm procid is" "$SLURM_PROCID"
echo "=============== srun begins =================="
srun bash train-multigpu.sh
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
#!/bin/bash
# SLURM batch script: requests node r8l40-a01 exclusively with 8 GPUs and
# runs one task on it.
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J train-rl-%j # The job name
#SBATCH -o ret_one/%j.out # Write the standard output to 'ret_one/<job_number>.out'
#SBATCH -e ret_one/%j.err # Write the standard error to 'ret_one/<job_number>.err'
#- Resources
# (TODO)
# Please modify your requirements
#SBATCH --mem=0
#SBATCH --exclusive
#SBATCH --ntasks-per-node=1 # Request P tasks per node
###SBATCH --cpus-per-task=48 # Request Q core per task; means that P*Q cores per node
# SBATCH
#SBATCH -p r8nv-gpu-hw # Submit to the 'r8nv-gpu-hw' partition
#SBATCH -t 1-05:59:59 # Run for a maximum time of 1 day, 05 hours, 59 mins, 59 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:8 # Request M GPU per node
#SBATCH --qos=gpu-normal # Request QOS Type
#SBATCH --nodelist=r8l40-a01
#SBATCH --exclude=r8a100-d[07]
#==========================================================================#
# Please add the SLURM parameter configuration above this horizontal line, #
# and read the README and F.A.Q. at the end of this document. #
#==========================================================================#
# Job body: derives GPU/task counts and the rendezvous address from the
# SLURM environment, validates them, exports the NCCL settings, logs the
# configuration, and starts train-multigpu.sh once per task via srun.
export USER_GPUS_PER_NODE=8 # <--------------------- Modify it in time!
export USER_NGPUS=$(($USER_GPUS_PER_NODE*$SLURM_JOB_NUM_NODES))
# "host1:<gpus>,host2:<gpus>,..." built from the expanded node list.
# $SLURM_JOB_NODELIST is quoted: a compressed list such as name[01-02] is a
# valid glob pattern and must reach scontrol unchanged.
nodelist_h_format=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | \
    awk -v gpu="$USER_GPUS_PER_NODE" '{printf ((NR>1?",":"")$0":%s"), gpu}')
#- Check
# NOTE: `exit -1` is reported by bash as exit status 255.
if [[ -z "$SLURM_NTASKS" ]]; then
    echo "SLURM_NTASKS is empty, please check your SBATCH parameter."
    exit -1
fi
if [[ -z "$SLURM_NTASKS_PER_NODE" ]]; then
    echo "SLURM_NTASKS_PER_NODE is empty, please check your SBATCH parameter."
    exit -1
fi
task_size=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
if [[ "$task_size" != "$SLURM_NTASKS" ]]; then
    echo "NTASKS_PER_NODE * NNODE != NNTASK, please check your SBATCH parameter."
    exit -1
fi
if [[ "$task_size" != "$USER_NGPUS" ]]; then
    echo "INFO..."
    echo "That's a total of $SLURM_NTASKS tasks, requiring a total of $USER_NGPUS GPUs"
    echo "Becareful whether your program requires \$SLURM_NTASKS or NGPUS"
fi
#- Global Info
# The first host of the allocation acts as the rendezvous master.
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
# Port = 50000 + last four characters of the job id. expr is kept on
# purpose: it reads a suffix such as 0812 as decimal, whereas $(( )) would
# treat the leading 0 as an octal prefix.
export MASTER_PORT=$(expr 50000 + $(echo -n "$SLURM_JOBID" | tail -c 4))
### (TODO)
### Warning:
### Sometimes WORLD_SIZE is not the number of tasks, it may be the number of GPUs,
### or even other values which need to be decided according to the actual situation
###
### export WORLD_SIZE=${USER_NGPUS}
### or
### export WORLD_SIZE=${task_size}
###
export WORLD_SIZE=${USER_NGPUS}
#- NCCL Setting
###
### IB here refers to RDMA, not the InfiniBand network in the narrow sense,
### it consists of RDMA over IB network, or RDMA over Converged Ethernet
###
### RDMA's advantages: Zero-Copy and Kernel Bypass, make it faster than TCP stack
### Since the cluster is basically configured with IB NICs, the best performance is obtained when using RDMA
###
### The NCCL_DEBUG variable controls the debug information that is displayed from NCCL
### INFO - Prints debug information
### export NCCL_DEBUG="INFO"
###
export NCCL_IB_DISABLE=0 # 0: Using RDMA, 1: Using TCP/IP
export NCCL_P2P_DISABLE=0 # 0: Using P2P, 1: Not P2P, using cpu forwarding (high latency)
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_NET_GDR_LEVEL=2
export NCCL_IB_HCA="mlx5_0,mlx5_1,mlx5_2,mlx5_3"
#- Log information
# Values are quoted so bracketed node lists are printed literally instead of
# being glob-expanded against the working directory.
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Nodelist:= " "$SLURM_JOB_NODELIST"
echo "Nodelistname:= " "$nodelist_h_format"
echo "Number of nodes:= " "$SLURM_JOB_NUM_NODES"
echo "Ntasks per node:= " "$SLURM_NTASKS_PER_NODE"
echo "Ntasks of jobs:= " "$SLURM_NTASKS"
echo "NGPUs of jobs:= " "$USER_NGPUS"
echo "MASTER_ADDR:= " "$MASTER_ADDR"
echo "MASTER_PORT:= " "$MASTER_PORT"
echo "WORLD_SIZE:= " "$WORLD_SIZE"
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "The job is triggered on node:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded
#- Tools
module load cluster-tools/v1.0
module load slurm-tools/v1.0
echo "$(df -h | grep -v tmpfs)"
cluster-quota # nas quota
#- Job step
# (TODO) Be sure to modify the template.multi-gpus-task.sh file as well.
echo "slurm procid is" "$SLURM_PROCID"
echo "=============== srun begins =================="
srun bash train-multigpu.sh
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment