Unverified Commit 77f065ea by HL Committed by GitHub

example: fix the gemma2 example, update NGC dockerfile (#291)

parent 0dfcb7f9
# docker buildx build --platform linux/x86_64 -t "verlai/verl:ngc-th2.4.0-cu124-vllm0.6.3-ray2.4-te1.7-v0.0.6" -f docker/Dockerfile.ngc.vllm . --builder cloud-verlai-verl-builder --progress=plain --push
FROM nvcr.io/nvidia/pytorch:24.05-py3
# uninstall nv-pytorch fork
...@@ -11,33 +12,36 @@ RUN pip3 uninstall pytorch-quantization \
RUN pip3 install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124
# make sure torch version is kept # =============== Megatron dependencies (optional) =================
# install apex, set MAX_JOBS to avoid OOMs
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# =============== End of Megatron dependencies (optional) =================
RUN pip3 install --no-cache-dir \ RUN pip3 install --no-cache-dir \
"torch==2.4.0" \
accelerate \
codetiming \
datasets \
dill \
hydra-core \
numpy \
pybind11 \ 'pandas' \
tensordict \ 'peft' \
"transformers<=4.46.0" 'pyarrow>=15.0.0' \
'pybind11' \
# ray is installed via vllm 'pylatexenc' \
RUN pip3 install --no-cache-dir vllm==0.6.3 'ray>=2.10' \
'tensordict<0.6' \
# we choose flash-attn v2.7.0 or v2.7.2 which contain pre-built wheels 'transformers' \
RUN pip3 install --no-cache-dir --no-build-isolation flash-attn==2.7.0.post2 'vllm==0.6.3.post1' \
'wandb'
# install apex, set MAX_JOBS to avoid OOMs
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \ # full dependencies
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \ RUN pip3 install pytest yapf py-spy pyext liger-kernel
git+https://github.com/NVIDIA/apex
# =============== Megatron dependencies (optional) =================
# install Transformer Engine, which requires FA 2.5.8 # install Transformer Engine, which requires FA 2.5.8. Do it in a separate step for docker cache
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install flash-attn==2.5.8 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.7 RUN MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
# =============== End of Megatron dependencies (optional) =================
# Pin wandb to v0.18 since v0.19.1 is released with ImportError
RUN pip3 install wandb==0.18.7 py-spy
...@@ -9,7 +9,7 @@ python3 -m verl.trainer.main_ppo \
data.max_response_length=512 \
actor_rollout_ref.model.path=google/gemma-2-2b-it \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \ actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
...@@ -21,7 +21,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \ critic.model.use_remove_padding=False \
critic.model.path=google/gemma-2-2b-it \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size_per_gpu=4 \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment