Unverified Commit 77f065ea by HL Committed by GitHub

example: fix the gemma2 example, update NGC dockerfile (#291)

parent 0dfcb7f9
# docker buildx build --platform linux/x86_64 -t "verlai/verl:ngc-th2.4.0-cu124-vllm0.6.3-ray2.4-te1.7-v0.0.6" -f docker/Dockerfile.ngc.vllm . --builder cloud-verlai-verl-builder --progress=plain --push
FROM nvcr.io/nvidia/pytorch:24.05-py3
# uninstall nv-pytorch fork
......@@ -11,33 +12,36 @@ RUN pip3 uninstall pytorch-quantization \
RUN pip3 install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124
# make sure torch version is kept
# =============== Megatron dependencies (optional) =================
# install apex, set MAX_JOBS to avoid OOMs
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# =============== End of Megatron dependencies (optional) =================
RUN pip3 install --no-cache-dir \
"torch==2.4.0" \
accelerate \
codetiming \
datasets \
dill \
hydra-core \
numpy \
pybind11 \
tensordict \
"transformers<=4.46.0"
# ray is installed via vllm
RUN pip3 install --no-cache-dir vllm==0.6.3
# we choose flash-attn v2.7.0 or v2.7.2 which contain pre-built wheels
RUN pip3 install --no-cache-dir --no-build-isolation flash-attn==2.7.0.post2
# install apex, set MAX_JOBS to avoid OOMs
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# install Transformer Engine, which requires FA 2.5.8
'pandas' \
'peft' \
'pyarrow>=15.0.0' \
'pybind11' \
'pylatexenc' \
'ray>=2.10' \
'tensordict<0.6' \
'transformers' \
'vllm==0.6.3.post1' \
'wandb'
# full dependencies
RUN pip3 install pytest yapf py-spy pyext liger-kernel
# =============== Megatron dependencies (optional) =================
# install Transformer Engine, which requires FA 2.5.8. Do it in a separate step for docker cache
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install flash-attn==2.5.8 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.7
# Pin wandb to v0.18 since v0.19.1 is released with ImportError
RUN pip3 install wandb==0.18.7 py-spy
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
# =============== End of Megatron dependencies (optional) =================
......@@ -9,7 +9,7 @@ python3 -m verl.trainer.main_ppo \
data.max_response_length=512 \
actor_rollout_ref.model.path=google/gemma-2-2b-it \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.actor.ppo_mini_batch_size=128 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
......@@ -21,7 +21,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.model.use_remove_padding=False \
critic.model.path=google/gemma-2-2b-it \
critic.model.enable_gradient_checkpointing=False \
critic.ppo_micro_batch_size_per_gpu=4 \
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment