Unverified Commit 77f065ea by HL Committed by GitHub

example: fix the gemma2 example, update NGC dockerfile (#291)

parent 0dfcb7f9
# docker buildx build --platform linux/x86_64 -t "verlai/verl:ngc-th2.4.0-cu124-vllm0.6.3-ray2.4-te1.7-v0.0.6" -f docker/Dockerfile.ngc.vllm . --builder cloud-verlai-verl-builder --progress=plain --push
FROM nvcr.io/nvidia/pytorch:24.05-py3
# uninstall nv-pytorch fork
...@@ -11,33 +12,36 @@ RUN pip3 uninstall pytorch-quantization \
RUN pip3 install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124
# make sure torch version is kept # =============== Megatron dependencies (optional) =================
# install apex, set MAX_JOBS to avoid OOMs
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# =============== End of Megatron dependencies (optional) =================
RUN pip3 install --no-cache-dir \ RUN pip3 install --no-cache-dir \
"torch==2.4.0" \
accelerate \
codetiming \
datasets \
dill \
hydra-core \
numpy \
pybind11 \ 'pandas' \
tensordict \ 'peft' \
"transformers<=4.46.0" 'pyarrow>=15.0.0' \
'pybind11' \
# ray is installed via vllm 'pylatexenc' \
RUN pip3 install --no-cache-dir vllm==0.6.3 'ray>=2.10' \
'tensordict<0.6' \
# we choose flash-attn v2.7.0 or v2.7.2 which contain pre-built wheels 'transformers' \
RUN pip3 install --no-cache-dir --no-build-isolation flash-attn==2.7.0.post2 'vllm==0.6.3.post1' \
'wandb'
# install apex, set MAX_JOBS to avoid OOMs
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \ # full dependencies
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \ RUN pip3 install pytest yapf py-spy pyext liger-kernel
git+https://github.com/NVIDIA/apex
# =============== Megatron dependencies (optional) =================
# install Transformer Engine, which requires FA 2.5.8 # install Transformer Engine, which requires FA 2.5.8. Do it in a separate step for docker cache
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install flash-attn==2.5.8 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.7 RUN MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
# =============== End of Megatron dependencies (optional) =================
# Pin wandb to v0.18 since v0.19.1 is released with ImportError
RUN pip3 install wandb==0.18.7 py-spy
...@@ -9,7 +9,7 @@ python3 -m verl.trainer.main_ppo \
data.max_response_length=512 \
actor_rollout_ref.model.path=google/gemma-2-2b-it \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
...@@ -21,7 +21,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \ critic.model.use_remove_padding=False \
critic.model.path=google/gemma-2-2b-it \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size_per_gpu=4 \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment