Commit 6432075e by root

update LLM generation with tools in verl

parent 560f8208
@@ -13,6 +13,10 @@
*.slurm*
/*.sh*
**/tmp/
/data/*
/ret_one/*
/testbench/*
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -5,7 +5,10 @@
### Data preprocessing
The preprocessing script is `examples/data_preprocess/codev.py`; `scripts/preprocess.sh` records how it is invoked, including the paths of the raw data and the processed data.
### Core mechanism
In verl v0.4.0, the core mechanism of tool-calling rollouts lives in the `_async_rollout_a_request` function in `verl/workers/rollout/sglang_rollout/sglang_rollout.py` and the `AsyncRolloutRequest` class in `verl/workers/rollout/schemas.py`.
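For orientation, here is a schematic sketch of what such a multi-turn, tool-calling rollout does: sample an assistant turn, detect tool calls, execute them, append the observations, and repeat until a final answer or a turn limit is reached. The names below (`engine.generate`, `tools[...].execute`, `reply.tool_calls`) are placeholders, not verl's actual interfaces; the authoritative logic is in `_async_rollout_a_request`.

```python
# Illustrative sketch only; not verl's real API.
async def rollout_one_request(req, engine, tools, max_turns=8):
    messages = list(req.messages)
    for _ in range(max_turns):
        reply = await engine.generate(messages)            # sample one assistant turn
        messages.append({"role": "assistant", "content": reply.text})
        if not reply.tool_calls:                           # no tool call -> final answer
            break
        for call in reply.tool_calls:                      # execute each requested tool
            observation = await tools[call.name].execute(call.arguments)
            messages.append({"role": "tool", "content": observation})
    return messages
```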
## Example (RL)
Below is an example on the ereus2 dataset; the configuration script is `examples/tir/run_tir.sh`.
@@ -224,8 +227,7 @@ builtins.print = traced_print
#### Prompt length filtering did not account for the tool prompt, causing later prompt-length overflow errors
In the `_read_files_and_tokenize` function of `verl/utils/dataset/rl_dataset.py`, verl's initial prompt-length filtering does not count the prompt tokens contributed by the tools, so the estimated prompt length is too short and later runs fail with a length-overflow error. To address this, I added the length filtering back into preprocessing (`examples/data_preprocess/codev.py`).
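As a sketch of the idea behind the fix (not the exact code in `examples/data_preprocess/codev.py`), the prompt length can be measured with the tool schemas rendered into the chat template, so the filter sees the same prompt the rollout engine will see. The model name, field names and length limit below are placeholders:

```python
# Hedged sketch: measure prompt length with the tool schemas included before filtering.
from transformers import AutoTokenizer

MAX_PROMPT_LENGTH = 2048  # placeholder for data.max_prompt_length
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")  # placeholder model

def prompt_length_with_tools(messages, tools):
    # apply_chat_template(..., tools=...) renders the tool schemas into the prompt,
    # so the measured length matches what the rollout engine actually receives.
    token_ids = tokenizer.apply_chat_template(
        messages, tools=tools, add_generation_prompt=True, tokenize=True
    )
    return len(token_ids)

def keep_sample(sample, tools):
    # `sample["prompt"]` is assumed to be a list of chat messages.
    return prompt_length_with_tools(sample["prompt"], tools) <= MAX_PROMPT_LENGTH
```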
#### Finally, a (presumably) normal initial test-set accuracy
@@ -253,4 +255,8 @@ In the `_read_files_and_tokenize` function of `verl/utils/dataset/rl_dataset.py`, verl
"'val-core/taco/reward/best@2/mean': 0.0, 'val-core/taco/reward/best@2/std': "
"0.0, 'val-aux/taco/reward/worst@2/mean': 0.0, "
"'val-aux/taco/reward/worst@2/std': 0.0}")
```
## Example (Inference)
verl's current `verl/trainer/main_generation.py` does not support tool calling, so I copied some of the RL training code and adapted it to support tool calling; the file is `verl/trainer/custom_generation.py` and the launch script is `generate_test.sh`.
In addition, I made some small changes to `verl/trainer/config/generation.yaml` to adapt the configuration.
Bootstrap: docker
# Support - Training: fsdp; Inference: vllm
# FROM: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.5-rocm630
%environment
export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
export HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
export CFLAGS="-D__HIP_PLATFORM_AMD__"
export CXXFLAGS="-D__HIP_PLATFORM_AMD__"
%post
# Create source directory
mkdir -p /opt/src
# Uninstall and reinstall vllm
pip uninstall -y vllm
cd /opt/src
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git
cd vllm
MAX_JOBS=$(nproc) python3 setup.py install
cd /opt
rm -rf /opt/src/vllm
# Install dependencies
pip install "tensordict<0.6" --no-deps
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]" \
torchdata \
transformers \
wandb \
orjson \
pybind11
# Clone and install verl from GitHub
cd /opt
git clone https://github.com/volcengine/verl.git
cd verl
# Uncomment to use a specific version
# git checkout v0.3.0.post0
pip install -e . --no-deps
# Install torch_memory_saver
pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps
FROM whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3
# For aws instances with EFA net interface (Sagemaker AI Pod)
# install EFA driver:
######## AWS EFA ############
ENV NCCL_VERSION=2.25.1-1
ENV DEBIAN_FRONTEND=noninteractive
ENV EFA_INSTALLER_VERSION=1.40.0
ENV AWS_OFI_NCCL_VERSION=1.14.2
ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0
ENV FI_PROVIDER=efa
RUN apt update && apt install -y linux-image-generic libhwloc-dev
RUN cd /tmp && \
curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
cd aws-efa-installer && \
./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \
ldconfig && \
rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
# NCCL EFA Plugin
RUN cd /tmp && \
curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
cd /tmp/aws-ofi-nccl && \
./autogen.sh && \
./configure --prefix=/opt/amazon/efa \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-platform-aws \
--with-mpi=/opt/amazon/openmpi && \
make -j$(nproc) install && \
rm -rf /tmp/aws-ofi-nccl
# NCCL
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
ldconfig
ENV OMPI_MCA_pml=^cm,ucx \
OMPI_MCA_btl=tcp,self \
OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \
OPAL_PREFIX=/opt/amazon/openmpi \
NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent \
FI_EFA_USE_HUGE_PAGE=0
# docker build -t whatcanyousee/verl:awsefa --label "commit=$(git rev-parse --short HEAD)" .
# on aws:
# docker run --ipc=host --privileged --name verldev --gpus all --network=host --shm-size=1800gb -itd whatcanyousee/verl:awsefa
FROM verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
RUN cd /opt/nvidia && git clone --single-branch --branch core_r0.11.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
# only config pip index with https://pypi.tuna.tsinghua.edu.cn/simple if needed
# unset for now
RUN cd /opt/nvidia/Megatron-LM && pip3 install --no-deps -e .
# docker buildx build --platform linux/x86_64 -t "verlai/verl:ngc-th2.4.0-cu124-vllm0.6.3-ray2.4-te1.7-v0.0.6" -f docker/Dockerfile.ngc.vllm . --builder cloud-verlai-verl-builder --progress=plain --push
FROM nvcr.io/nvidia/pytorch:24.05-py3
# uninstall nv-pytorch fork
RUN pip3 uninstall pytorch-quantization \
pytorch-triton \
torch \
torch-tensorrt \
torchvision \
xgboost transformer_engine flash_attn \
apex megatron-core -y
RUN pip3 install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124
# =============== Megatron dependencies (optional) =================
# install apex, set MAX_JOBS to avoid OOMs
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# =============== End of Megatron dependencies (optional) =================
RUN pip3 install --no-cache-dir \
accelerate \
codetiming \
datasets \
dill \
hydra-core \
numpy \
'pandas' \
'peft' \
'pyarrow>=15.0.0' \
'pybind11' \
'pylatexenc' \
'ray>=2.10' \
'tensordict<0.6' \
'transformers' \
'vllm==0.6.3.post1' \
'wandb'
# full dependencies
RUN pip3 install pytest pre-commit py-spy pyext liger-kernel
# =============== Megatron dependencies (optional) =================
# install Transformer Engine, which requires FA 2.5.8. Do it in a separate step for docker cache
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install flash-attn==2.5.8 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
# =============== End of Megatron dependencies (optional) =================
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Install torch-2.6.0+cu124 + vllm-0.8.3
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --no-cache-dir "vllm==0.8.3" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata \
"transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN wget -nv https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install verl
RUN pip install --no-cache-dir verl[vllm] -U
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# Using a pre-built image from AWS DLC which contains the current version of python (3.10) and supported cuda version (12.1)
FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04
# uninstall nv-pytorch fork
RUN pip3 uninstall -y pytorch-quantization \
pytorch-triton torch torch-tensorrt torchvision \
xgboost transformer_engine flash_attn apex megatron-core
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Install torch-2.6.0 + vllm-0.8.2
RUN pip install --no-cache-dir vllm==0.8.2 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata==0.11.0 \
transformers>=4.49.0 accelerate datasets peft hf-transfer \
ray[default] codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
pytest pre-commit py-spy pyext ruff
# Install flash_attn-2.7.4.post1
RUN pip uninstall -y transformer-engine flash-attn && \
pip install flash-attn==2.7.4.post1 --no-build-isolation
# Fix cv2
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir nvidia-ml-py>=12.560.30 opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
pip install --no-cache-dir --upgrade optree>=0.13.0
# Install verl
RUN pip install --no-cache-dir verl[vllm] -U
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# Build the docker in the repo dir:
# docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
# docker images # you can find your built docker
# Support - Training: fsdp; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post5-rocm630
# Set working directory
# WORKDIR $PWD/app
# Set environment variables
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
# Install vllm
RUN pip uninstall -y vllm && \
rm -rf vllm && \
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
cd vllm && \
MAX_JOBS=$(nproc) python3 setup.py install && \
cd .. && \
rm -rf vllm
# Copy the entire project directory
COPY . .
# Install dependencies
RUN pip install "tensordict==0.6.2" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]<2.45.0" \
torchdata \
transformers \
wandb \
orjson \
pybind11 && \
pip install -e . --no-deps
# Install torch_memory_saver
RUN pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps
# Start from the NVIDIA official image (ubuntu-22.04 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
# Define installation arguments
ARG APT_SOURCE=https://mirrors.ustc.edu.cn/ubuntu/
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source
ARG PIP_INDEX=https://mirrors.aliyun.com/pypi/simple/
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip uninstall -y cuda-python && pip install "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Install torch-2.6.0
RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
transformers>=4.49.0 accelerate datasets peft hf_transfer \
ray[default] codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb liger-kernel \
pytest pre-commit py-spy pyext
# Install flash_attn-2.7.4.post1
RUN pip uninstall -y transformer-engine flash-attn && \
wget -v https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix cv2
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir nvidia-ml-py>=12.560.30 opencv-python-headless==4.8.0.74 fastapi==0.115.6
# docker buildx build --platform linux/x86_64 -t "verlai/verl:$TAG" -f docker/$FILE .
# the one in docker.io is an alias for the one in veturbo
# FROM vemlp-cn-beijing.cr.volces.com/veturbo/pytorch:2.4-cu124
FROM docker.io/haibinlin/verl:v0.0.5-th2.4.0-cu124-base
# only config pip index with https://pypi.tuna.tsinghua.edu.cn/simple if needed
# unset for now
RUN pip3 config unset global.index-url
# transformers 4.47.0 contains the following bug:
# AttributeError: 'Gemma2Attention' object has no attribute '_flash_attn_uses_top_left_mask'
RUN pip3 install --no-cache-dir \
torch==2.4.0 \
accelerate \
codetiming \
dill \
hydra-core \
numpy \
pybind11 \
tensordict \
"transformers <= 4.46.0"
RUN pip3 install --no-cache-dir flash-attn==2.7.0.post2 --no-build-isolation
# vllm depends on ray
RUN pip3 install --no-cache-dir vllm==0.6.3 ray==2.10
# install apex
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# install Transformer Engine
# - flash-attn pinned to 2.5.3 by TransformerEngine, switch to eric-haibin-lin/TransformerEngine.git@v1.7.0 to relax version req
# - install with: MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 to avoid OOM
# - cudnn is required by TransformerEngine
# RUN CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn \
# pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" pip3 install flash-attn==2.5.3 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.7
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Reinstall CUDA 12.4
RUN aria2c https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
RUN aria2c --always-resume=true --max-tries=99999 https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cuda-toolkit-12-4 && \
rm cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
update-alternatives --set cuda /usr/local/cuda-12.4 && \
rm -rf /usr/local/cuda-12.6
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1 + sglang-0.4.6.post5
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
RUN pip install --no-cache-dir "vllm==0.8.5.post1" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata
RUN pip install --no-cache-dir "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
RUN pip install --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install Apex
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@v2.3
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.0
# Fix opencv
RUN pip install opencv-python
RUN pip install opencv-fixer && \
python -c "from opencv_fixer import AutoFix; AutoFix()"
# Install verl
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
RUN apt-get update && \
apt-get install -y aria2 libfreeimage3 libfreeimage-dev zlib1g
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Reinstall CUDA 12.4
RUN aria2c https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
RUN aria2c --always-resume=true --max-tries=99999 https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cuda-toolkit-12-4 && \
rm cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
update-alternatives --set cuda /usr/local/cuda-12.4 && \
rm -rf /usr/local/cuda-12.6
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1 + sglang-0.4.6.post5
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install --resume-retries 999 torch-memory-saver --no-cache-dir
RUN pip install --resume-retries 999 --no-cache-dir "vllm==0.8.5.post1" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata
RUN pip install --resume-retries 999 --no-cache-dir "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install Apex
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@v2.3
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.1
# Fix opencv
RUN pip install opencv-python
RUN pip install opencv-fixer && \
python -c "from opencv_fixer import AutoFix; AutoFix()"
# Install verl
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
RUN apt-get update && \
apt-get install -y aria2 libfreeimage3 libfreeimage-dev zlib1g
# Start from the NVIDIA official image (ubuntu-22.04 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# uninstall nv-pytorch fork
RUN pip3 uninstall -y pytorch-quantization \
pytorch-triton torch torch-tensorrt torchvision \
xgboost transformer_engine flash_attn apex megatron-core
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Install torch-2.6.0 + vllm-0.8.1
RUN pip install --no-cache-dir vllm==0.8.1 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
transformers>=4.49.0 accelerate datasets peft hf-transfer \
ray codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
pytest yapf py-spy pyext pre-commit ruff
# Install flash_attn-2.7.4.post1
RUN pip uninstall -y transformer-engine flash-attn && \
wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix cv2
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir nvidia-ml-py>=12.560.30 opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
pip install -U optree>=0.13.0
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = verl
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# verl documentations
## Build the docs
```bash
# Install dependencies.
pip install -r requirements-docs.txt
# Build the docs.
make clean
make html
```
## Open the docs with your browser
```bash
python -m http.server -d _build/html/
```
Launch your browser and navigate to http://localhost:8000 to view the documentation.
# Upgrading to vllm >= 0.7
Note: verl+vllm 0.8.3 is now stable. Please see ``docs/README_vllm0.8.md`` for the upgrade guide.
## Installation
Note: At time of writing, verl+vllm 0.7.x supports **FSDP** for training and **vLLM** for rollout.
```
# Create the conda environment
conda create -n verl python==3.10
conda activate verl
# Install verl
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
# Install the latest stable version of vLLM
pip3 install vllm==0.7.3
# Install flash-attn
pip3 install flash-attn --no-build-isolation
```
Note that if you are installing a lower version of vLLM (0.7.0, 0.7.1, 0.7.2), you need to manually apply some small patches to vllm (under /path/to/site-packages/vllm after installation) after the above steps:
- vllm/distributed/parallel_state.py: Remove the assertion below:
```
if (world_size
!= tensor_model_parallel_size * pipeline_model_parallel_size):
raise RuntimeError(
f"world_size ({world_size}) is not equal to "
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
```
- vllm/executor/uniproc_executor.py: change `local_rank = rank` to `local_rank = int(os.environ["LOCAL_RANK"])`
- vllm/model_executor/model_loader/weight_utils.py: remove the `torch.cuda.empty_cache()` in `pt_weights_iterator`
## Features
### Use cuda graph
After installation, examples using FSDP as training backends can be used. By default, the `enforce_eager` is set to True, which disables the cuda graph. To enjoy cuda graphs and the sleep mode of vLLM>=0.7, add the following lines to the bash script:
```
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
```
For a typical job like examples/ppo_trainer/run_qwen2-7b_seq_balance.sh, the rollout generation time is 115 seconds with vLLM 0.6.3 and 85 seconds with vLLM 0.7.0. By enabling CUDA graphs, the generation duration is further reduced to 62 seconds.
**Note:** Currently, if `n` is greater than 1 in `SamplingParams` in vLLM>=0.7, the V0 engine may show unstable rollout generation times (some iterations see bursts in generation time).
### Use vLLM V1 Engine
Using the vLLM V1 engine can avoid instability issues and achieve additional performance improvements. To use the V1 engine, you can first uninstall the previously installed vLLM and then follow the steps below to install the newer version.
```
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout 2275784
sed -i "903a\ data_parallel_size = world_size // pipeline_model_parallel_size // tensor_model_parallel_size" ./vllm/distributed/parallel_state.py
VLLM_USE_PRECOMPILED=1 pip install --editable .
```
Then you can enable the V1 engine by setting `export VLLM_USE_V1=1`. In some benchmark tests, the V1 engine demonstrates a 1.5x speed improvement over the vLLM V0 engine.
Stable support for the vLLM V1 engine is available on verl main.
# Upgrading to vLLM >= 0.8
## Installation
Note: This version of verl+vLLM 0.8+ supports **FSDP** for training and **vLLM** for rollout.
```bash
# Create the conda environment
conda create -n verl python==3.10
conda activate verl
# Install verl
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
# Install the latest stable version of vLLM
pip3 install vllm==0.8.3
# Install flash-attn
pip3 install flash-attn --no-build-isolation
```
We have a pre-built docker image for verl+vLLM 0.8.3. You can pull it directly with the following command:
```bash
docker pull hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0
```
## Features
vLLM 0.8+ supports CUDA graphs and the V1 engine by default in verl. To enable these features, remember to add the following lines to the bash script:
```bash
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
```
and also **remove** the environment variable if it exists:
```bash
# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
# export VLLM_ATTENTION_BACKEND=XFORMERS
```
## Notes
If you directly upgrade to vllm>=0.8, some dependency packages may change versions. If you encounter the following problem:
```bash
in <module> from torch.multiprocessing.reductions import ForkingPickler ImportError: cannot import name 'ForkingPickler' from 'torch.multiprocessing.reductions' (/opt/conda/lib/python3.11/site-packages/torch/multiprocessing/reductions.py)
```
You need to upgrade `tensordict` to version 0.6.2 using the command `pip install tensordict==0.6.2`.
document.addEventListener("DOMContentLoaded", function () {
var script = document.createElement("script");
script.type = "module";
script.id = "runllm-widget-script";
script.src = "https://widget.runllm.com";
script.setAttribute("version", "stable");
script.setAttribute("crossorigin", "true");
script.setAttribute("runllm-keyboard-shortcut", "Mod+j");
script.setAttribute("runllm-name", "verl Chatbot");
script.setAttribute("runllm-position", "TOP_RIGHT");
script.setAttribute("runllm-assistant-id", "679");
script.async = true;
document.head.appendChild(script);
});
Using Checkpoints to Support Fault Tolerance Training
=====================================================
Training errors or machine failures can occur during the RLHF training process,
so it is recommended to enable checkpointing to minimize your losses.
The API interface has already been listed in :ref:`config-explain-page`,
and we will not repeat it here. But there are still some technical details
we hope to clarify.
.. note::
Notice that the ``checkpoint.contents`` field has no effect on the FSDP checkpoint except for ``hf_model``;
the other three fields are bound together for saving and loading. We recommend including all of ``model``, ``optimizer`` and ``extra``.
Checkpoint Saving Directory Structure
-------------------------------------
Commonly, we use the ``default_local_dir`` declared in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yml``
as the prefix when saving checkpoints, which is ``checkpoints/${trainer.project_name}/${trainer.experiment_name}``.
The inner checkpoint structure of **FSDP** then looks like:
.. code::
checkpoints/${trainer.project_name}/${trainer.experiment_name}
├── global_steps_${i}
│ ├── actor
│ │ ├── model_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt
│ ├── actor_huggingface
│ ├── critic
│ │ ├── model_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt
│ └── critic_huggingface
└── latest_checkpointed_iteration.txt
All model shards, optimizer states and extra states are stored together, in a sharded and distributed way.
The current **Megatron** checkpoint structure is:
.. code::
checkpoints/${trainer.project_name}/${trainer.experiment_name}
├── global_steps_${i}
│ ├── actor
│ │ ├── huggingface # saves the tokenizer by default; also saves the huggingface model if ``hf_model`` is included in checkpoint.contents
│ │ ├── model # save sharded model, naming the same as Megatron
│ │ │ ├── mp_rank_xx_yyy # xx is tp_rank in 2 digits, yyy is pp_rank in 3 digits
│ │ │ │ └── model_states.pt
│ │ │ └── mp_rank_xx_xxx
│ │ ├── optim
│ │ │ └── distrib_optim_pp{a}_tp{b}_cp{c}_dp{d}.pt
│ │ └── rng_states
│ └── critic
│ │ ├── huggingface
│ │ ├── model
│ │ ├── optim
│ │ └── rng_states
└── latest_checkpointed_iteration.txt
Convert FSDP and Megatron Checkpoints to HuggingFace Format Model
-----------------------------------------------------------------
We provide a tool to convert FSDP and Megatron checkpoints to a HuggingFace-format model.
The tool is located in ``scripts/model_merger.py``.
The script supports two main sub-commands: `merge` (to convert and save checkpoints) and `test` (to validate merged checkpoints against a reference model).
The arguments for the `merge` sub-command are as follows:
.. code:: bash
usage: model_merger.py merge [-h] --backend {fsdp,megatron} --local_dir LOCAL_DIR [--hf_model_path HF_MODEL_PATH]
[--tie-word-embedding] [--is-value-model] [--target_dir TARGET_DIR]
[--hf_upload_path HF_UPLOAD_PATH] [--private]
options:
-h, --help show this help message and exit
--backend {fsdp,megatron}
The backend of the model
--local_dir LOCAL_DIR
Path to the saved model checkpoints
--hf_model_path HF_MODEL_PATH
(Deprecated) Path to the original Hugging Face model for config.
--tie-word-embedding Whether to tie word embedding weights (currently only Megatron supported)
--is-value-model Whether the model is a value model (currently only Megatron supported)
--target_dir TARGET_DIR
Directory to save the merged huggingface model
--hf_upload_path HF_UPLOAD_PATH
Hugging Face repository ID to upload the model
--private Whether to upload the model to a private Hugging Face repository
Example usage for merging Megatron checkpoints:
.. code:: bash
python scripts/model_merger.py merge \
--backend megatron \
--tie-word-embedding \
--local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \
--target_dir /path/to/merged_hf_model
Example usage for merging FSDP checkpoints:
.. code:: bash
python scripts/model_merger.py merge \
--backend fsdp \
--local_dir checkpoints/verl_fsdp_gsm8k_examples/qwen2_5_0b5_fsdp_saveload/global_step_1/actor \
--target_dir /path/to/merged_hf_model
Megatron Merger details
-----------------------
The current implementation of the decoder layers uses ``nn.ModuleList`` to store the layers,
so the model layers on every PP rank and VPP rank start their indices from 0.
There are three ways to correct this behavior:
1. Modify the decoder layers' state_dict by adding an ``offset`` to each layer's index, which means rewriting the ``nn.ModuleList`` implementation.
2. Modify the layer indices when saving the checkpoint and restore them when loading it.
3. Let the checkpoint merger do this work, computing the actual ``offset`` from the ``state_dict`` alone, which is a little more complex.
The current implementation uses solution 2, as sketched below.
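A minimal sketch of solution 2 (illustrative only; the key pattern and helper name here are assumptions, see ``scripts/model_merger.py`` for the real logic):

.. code:: python

# Hedged sketch: shift local per-PP-rank layer indices by the rank's global
# offset when saving, and shift them back (negative offset) when loading.
import re

_LAYER_KEY = re.compile(r"(decoder\.layers\.)(\d+)(\.)")  # assumed key layout

def shift_layer_indices(state_dict, offset):
    """Rename 'decoder.layers.<i>.' keys to 'decoder.layers.<i + offset>.'."""
    shifted = {}
    for key, tensor in state_dict.items():
        new_key = _LAYER_KEY.sub(
            lambda m: f"{m.group(1)}{int(m.group(2)) + offset}{m.group(3)}", key
        )
        shifted[new_key] = tensor
    return shifted

# Saving on a PP rank that owns global layers [offset, offset + num_local_layers):
#   global_sd = shift_layer_indices(local_sd, offset)
# Loading: apply shift_layer_indices(global_sd, -offset) before load_state_dict.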
HuggingFace to Megatron DistCheckpoint details
----------------------------------------------
If your model is very large, we recommend using the Megatron dist-checkpoint to load the model.
The Megatron dist-checkpoint supports loading with different kinds of model parallelism,
and it is much faster than the original checkpoint loading.
To convert an original HuggingFace model to a Megatron dist-checkpoint,
you can use the ``scripts/converter_hf_to_mcore.py`` script. Large MoE models are temporarily supported via CPU initialization,
which is a little slower; we are working on a better solution to support large models.
Example command to convert the model is as follows:
.. code:: bash
python scripts/converter_hf_to_mcore.py \
--hf_model_path Qwen/Qwen1.5-MoE-A2.7B-Chat \
--output_path /mnt/disk/Qwen/Qwen1.5-MoE-A2.7B-Chat \
--use_cpu_initialization # Only work for MoE models
Original Checkpoint Utils
-------------------------
Original checkpoint utils refer to the original checkpoint implementation in ``verl/models/[model]/megatron/checkpoint_utils``.
We now only need ``[model]_loader.py`` from the original checkpoint utils, since we no longer store ``hf_model`` every time (which is not recommended for large-model training; try to save only sharded models if you can).
.. note::
Note that ``[model]_loader`` only supports environments where the **storage cluster can be reached from every compute node**,
because it uses a **sharded loading scheme to minimize the checkpoint-loading overhead**.
Every rank loads its own data from a ``state_dict`` that all of them can access.
There is also no need to broadcast among DP ranks, since the saved state_dict is produced only by DP rank 0.
For users who can **only place the huggingface model on one device**, we keep the original, costly implementation in ``[model]_loader_deprecated``. In this implementation, rank 0 broadcasts all weights to each TP and PP rank, and then DP rank 0 broadcasts to all DP ranks. This carries a risk of OOM.
To use the deprecated loader, change the import package of ``load_state_dict_to_megatron_llama``.
Extend to other RL(HF) algorithms
=================================
We have already implemented the complete training pipeline of the PPO
algorithm. To extend to other algorithms, we analyze the high-level
principles of using verl and provide a tutorial for implementing the DPO
algorithm. Users can follow a similar paradigm to extend to other RL algorithms.
.. note:: **Key ideas**: Single process drives multi-process computation and data communication.
Overall Approach
----------------
Step 1: Consider what multi-machine, multi-GPU computations are needed
for each model, such as ``generate_sequence``, ``compute_log_prob`` and
``update_policy`` in the actor_rollout model. Implement distributed
single-program-multiple-data (SPMD) computation and encapsulate it
into APIs.
Step 2: Based on different distributed scenarios, including FSDP and 3D
parallelism in Megatron-LM, implement single-process control of data
interaction among multi-process computations.
Step 3: Utilize the encapsulated APIs to implement the control flow
Example: Online DPO
-------------------
We use verl to implement a simple online DPO algorithm. The algorithm
flow of Online DPO is as follows:
1. There is a prompt (rollout) generator which has the same weights as
the actor model. After a batch of prompts is fed into the generator,
it generates N responses for each prompt.
2. Send all the prompts + responses to a verifier for scoring, which can
be a reward model or a rule-based function. Then sort them in pairs to
form a training batch.
3. Use this training batch to train the actor model using DPO. During
the process, a reference policy is needed.
Step 1: What are the multi-machine multi-GPU computations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**Sample Generator**
Implementation details:
.. code:: python
from verl.single_controller.base import Worker
from verl.single_controller.ray import RayWorkerGroup, RayClassWithInitArgs, RayResourcePool
import ray
@ray.remote
class SampleGenerator(Worker):
    def __init__(self, config):
        super().__init__()
        self.config = config

    def generate_sequences(self, data):
        pass
Here, ``SampleGenerator`` can be viewed as a group of processes launched by
``torchrun``, with each process running the same code (SPMD).
``SampleGenerator`` needs to implement a ``generate_sequences`` API for
the control flow to call. The implementation inside can use any
inference engine, including vllm, sglang and huggingface. Users can
largely reuse the code in
verl/verl/workers/rollout/vllm_rollout/vllm_rollout.py and we won't
go into details here.
**ReferencePolicy inference**
API: compute reference log probability
.. code:: python
from verl.single_controller.base import Worker
import ray
@ray.remote
class ReferencePolicy(Worker):
    def __init__(self):
        super().__init__()
        self.model = Model()

    def infer(self, data):
        return self.model(data)
**Actor update**
API: Update actor model parameters
.. code:: python
from verl.single_controller.base import Worker
import ray
@ray.remote
class DPOActor(Worker):
    def __init__(self):
        super().__init__()
        self.model = Model()
        self.model = FSDP(self.model)  # or other distributed strategy
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.loss_fn = xxx

    def update(self, data):
        self.optimizer.zero_grad()
        logits = self.model(data)
        loss = self.loss_fn(logits)
        loss.backward()
        self.optimizer.step()
**Notes: How to distinguish between control processes and distributed computation processes**
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- Control processes are generally functions directly decorated with
``@ray.remote``
- Computation processes are all wrapped into a ``RayWorkerGroup``.
Users can reuse most of the distributed computation logic implemented
in the PPO algorithm, including the FSDP and Megatron-LM backends in
verl/verl/trainer/ppo.
Step 2: Based on different distributed scenarios, implement single-process control of multi-process data interaction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**The core problem to solve here is how a single process sends data to
multiple processes, drives multi-process computation, and how the
control process obtains the results of multi-process computation.**
First, we initialize the multi-process ``WorkerGroup`` in the control
process.
.. code:: python
@ray.remote(num_cpus=1)
def main_task(config):
    # construct SampleGenerator
    resource_pool = RayResourcePool(process_on_nodes=[8] * 2)  # 16 GPUs
    ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
    # put SampleGenerator onto resource pool
    worker_group = RayWorkerGroup(resource_pool, ray_cls)
    # construct reference policy
As we can see, in the control process, multiple processes are wrapped
into a ``RayWorkerGroup``. Inside this ``WorkerGroup``, there is a
``self._workers`` member, where each worker is a RayActor
(https://docs.ray.io/en/latest/ray-core/actors.html) of SampleGenerator.
ray_trainer.md also provides an implementation of
``MegatronRayWorkerGroup``.
Assuming the model is distributed using FSDP, and there is a batch of
data on the control process, for data parallelism, the underlying
calling process is:
.. code:: python
data = xxx
data_list = data.chunk(dp_size)
output = []
for d in data_list:
    # worker_group._workers[i] is a SampleGenerator
    output.append(worker_group._workers[i].generate_sequences.remote(d))
output = ray.get(output)
output = torch.cat(output)
Single process calling multiple processes involves the following 3
steps:
1. Split the data into DP parts on the control process.
2. Send the data to remote, call the remote computation through RPC, and
utilize multi-process computation.
3. Obtain the computation results of each worker on the control process
and merge them.
Frequently calling these 3 steps on the controller process greatly hurts
code readability. **In verl, we have abstracted and encapsulated these 3
steps, so that the worker's method + dispatch + collect can be
registered into the worker_group**
.. code:: python
from verl.single_controller.base.decorator import register

def dispatch_data(worker_group, data):
    return data.chunk(worker_group.world_size)

def collect_data(worker_group, data):
    return torch.cat(data)

dispatch_mode = {
    'dispatch_fn': dispatch_data,
    'collect_fn': collect_data
}

@register(dispatch_mode=dispatch_mode)
def generate_sequences(self, data):
    pass
In this way, we can directly call the method inside the worker through
the ``worker_group`` on the control (driver) process (which is a single
process):
.. code:: python
output = worker_group.generate_sequences(data)
This single line includes data splitting, data distribution and
computation, and data collection.
Furthermore, the model parallelism size of each model is usually fixed,
including dp, tp, pp. So for these common distributed scenarios, we have
pre-implemented specific dispatch and collect methods in `decorator.py <https://github.com/volcengine/verl/blob/main/verl/single_controller/base/decorator.py>`_, which can be directly used to wrap the computations.
.. code:: python
from verl.single_controller.base.decorator import register, Dispatch

@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def generate_sequences(self, data: DataProto) -> DataProto:
    pass
Here the data interface is required to be ``DataProto``. The definition of
``DataProto`` is in `protocol.py <https://github.com/volcengine/verl/blob/main/verl/protocol.py>`_.
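For example, a batch can be wrapped into a ``DataProto`` before being passed to the registered method (a small sketch; shapes and field names are illustrative, and ``worker_group`` is the one constructed earlier):

.. code:: python

import torch
from verl import DataProto

# Wrap a dict of equally sized tensors into a DataProto
# (see protocol.py for the authoritative interface).
batch = DataProto.from_dict(
    tensors={
        "input_ids": torch.randint(0, 32000, (16, 512)),
        "attention_mask": torch.ones(16, 512, dtype=torch.long),
    }
)

# The dispatch function chunks the batch across DP ranks and the collect
# function concatenates the per-rank outputs.
output = worker_group.generate_sequences(batch)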
Step 3: Main training loop
~~~~~~~~~~~~~~~~~~~~~~~~~~
With the above training flows, we can implement the algorithm's control
flow. It is recommended that ``main_task`` is also a ray remote process.
.. code:: python
@ray.remote(num_cpus=1)
def main_task(config):
    # construct SampleGenerator
    resource_pool = RayResourcePool(process_on_nodes=[8] * 2)  # 16 GPUs
    ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
    # put SampleGenerator onto resource pool
    sample_gen = RayWorkerGroup(resource_pool, ray_cls)

    # construct reference policy
    ray_cls = RayClassWithInitArgs(ReferencePolicy)
    ref_policy = RayWorkerGroup(resource_pool, ray_cls)

    # construct actor
    ray_cls = RayClassWithInitArgs(DPOActor)
    dpo_policy = RayWorkerGroup(resource_pool, ray_cls)

    dataloader = DataLoader()
    for data in dataloader:
        # generate data
        data = sample_gen.generate_sequences(data)
        # generate scores for each data
        data = generate_scores(data)
        # generate pairwise data using scores
        data = generate_pairwise_data(data)
        # generate ref_log_prob
        data.batch['ref_log_prob'] = ref_policy.infer(data)
        # update using dpo
        dpo_policy.update(data)
        # logging
Here, different ``WorkerGroups`` can be placed in the same resource pool or
in different resource pools using ``create_colocated_worker_cls``,
similar to `ray_trainer.py <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/ray_trainer.py>`_.
Add models with the FSDP backend
==================================
Model
--------------------------
In principle, our FSDP backend can support any HF model, and we can
synchronize the actor model weights with vLLM using `hf_weight_loader.py` under `third_party/vllm`.
However, ``hf_weight_loader`` will gather the full state_dict of a
model during synchronization, which may cause OOM. We suggest using
``dtensor_weight_loader``, which gathers the full model parameters layer by
layer to reduce the peak memory usage. We already support the dtensor weight
loader for the models below in `dtensor_weight_loader.py` under `third_party/vllm`:
- ``GPT2LMHeadModel``
- ``LlamaForCausalLM``
- ``LLaMAForCausalLM``
- ``MistralForCausalLM``
- ``InternLMForCausalLM``
- ``AquilaModel``
- ``AquilaForCausalLM``
- ``Phi3ForCausalLM``
- ``GemmaForCausalLM``
- ``Gemma2ForCausalLM``
- ``GPTBigCodeForCausalLM``
- ``Starcoder2ForCausalLM``
- ``Qwen2ForCausalLM``
- ``DeepseekV2ForCausalLM``
To implement the ``dtensor_weight_loader`` of a model that is supported in
vLLM, follow the guide for the gemma model below:
1. Copy the
``load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]])`` from the vllm model class
to ``dtensor_weight_loaders.py``.
2. Modify the arguments to
``(actor_weights: Dict, vllm_model: nn.Module)``.
3. Replace ``self`` with ``vllm_model``.
4. Add
``local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)``
before each ``param = params_dict[name]`` and modify the subsequent
weight loading to use ``local_loaded_weight``.
5. Register the implemented dtensor weight loader to ``__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__``.
.. code-block:: diff
- def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
      stacked_params_mapping = [
          # (param_name, shard_name, shard_id)
          ("qkv_proj", "q_proj", "q"),
          ("qkv_proj", "k_proj", "k"),
          ("qkv_proj", "v_proj", "v"),
          ("gate_up_proj", "gate_proj", 0),
          ("gate_up_proj", "up_proj", 1),
      ]
-     params_dict = dict(self.named_parameters())
+     params_dict = dict(vllm_model.named_parameters())
      loaded_params = set()
-     for name, loaded_weight in weights:
+     for name, loaded_weight in actor_weights.items():
          for (param_name, shard_name, shard_id) in stacked_params_mapping:
              if shard_name not in name:
                  continue
              name = name.replace(shard_name, param_name)
              # Skip loading extra bias for GPTQ models.
              if name.endswith(".bias") and name not in params_dict:
                  continue
+             local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
              param = params_dict[name]
              weight_loader = param.weight_loader
-             weight_loader(param, loaded_weight, shard_id)
+             weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
              break
          else:
              # lm_head is not used in vllm as it is tied with embed_token.
              # To prevent errors, skip loading lm_head.weight.
              if "lm_head.weight" in name:
                  continue
              # Skip loading extra bias for GPTQ models.
              if name.endswith(".bias") and name not in params_dict:
                  continue
+             local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
              param = params_dict[name]
              weight_loader = getattr(param, "weight_loader",
                                      default_weight_loader)
-             weight_loader(param, loaded_weight)
+             weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
          loaded_params.add(name)
      unloaded_params = params_dict.keys() - loaded_params
      if unloaded_params:
          raise RuntimeError(
              "Some weights are not initialized from checkpoints: "
              f"{unloaded_params}")
Add models with the Megatron-LM backend
=========================================
Model
-----------
If you use the latest verl, the Megatron backend has direct support for ``GPTModel``.
You can add custom models in much the same way that Megatron is used to pretrain them.
We list the steps here:
1. Find `model_initializer.py <https://github.com/volcengine/verl/blob/main/verl/models/mcore/model_initializer.py>`_
2. If your model is configurable by ``TransformerLayerSpec``, you can
directly use ``GPTModel``. Otherwise, please implement a new
``ModelLayerSpec`` and ``ModelLayer`` here.
3. Use the right ``LayerSpec``, ``TransformerConfig`` and ``HuggingfaceConfig``
as arguments to initialize the GPTModel.
4. Return the model at the end (a minimal sketch follows below).
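A minimal sketch of steps 2-4 (values are illustrative; in verl the ``TransformerConfig`` is derived from the HuggingFace config inside ``model_initializer.py``, and Megatron's model-parallel state must already be initialized):

.. code:: python

from megatron.core.models.gpt import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.transformer import TransformerConfig

def initialize_gpt_model(pre_process=True, post_process=True):
    # Illustrative values; in verl they come from the HuggingFace config.
    config = TransformerConfig(num_layers=32, hidden_size=4096, num_attention_heads=32)
    layer_spec = get_gpt_layer_with_transformer_engine_spec()
    model = GPTModel(
        config=config,
        transformer_layer_spec=layer_spec,
        vocab_size=152064,            # from the tokenizer / HF config
        max_sequence_length=4096,
        pre_process=pre_process,      # first PP stage holds the embedding
        post_process=post_process,    # last PP stage holds the output layer
    )
    return model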
Add Models with old version of verl
-----------------------------------
The most challenging aspect of using the Megatron-LM backend is implementing
the models for training. Currently, we implement the Llama model, which
supports data parallelism, tensor parallelism, pipeline parallelism (also
vPP) and sequence parallelism. We also implement remove padding (sequence packing) for the Llama
model, which can be found in `modeling_llama_megatron.py <https://github.com/volcengine/verl/blob/main/verl/models/llama/megatron/modeling_llama_megatron.py>`_.
To support other models, users are required to implement:
1. A model similar to ``modeling_llama_megatron.py`` that satisfies the
parallelism requirements of Megatron-LM. Then register your model in
the `registry.py <https://github.com/volcengine/verl/blob/main/verl/models/registry.py>`_.
2. Checkpoint utils that can load a full checkpoint (e.g. a huggingface
checkpoint) into the partitioned models at runtime. Then register
your loader to ``weight_loader_registry`` in `weight_loader_registry.py <https://github.com/volcengine/verl/blob/main/verl/models/weight_loader_registry.py>`_.
3. A weight loader that synchronizes the weights from the Megatron model to the rollout
(vLLM) model. Note that both the actor model and the rollout model are
partitioned at runtime, so it is advisable to keep the parameter names aligned
in the actor model implementation. Otherwise, you may need an additional
name mapping and even weight transformation. The weight loader implementation
is in `megatron_weight_loaders.py <https://github.com/volcengine/verl/blob/main/verl/third_party/vllm/vllm_v_0_6_3/megatron_weight_loaders.py>`_.
Ray API Design Tutorial
=======================================
We provide a tutorial for our Ray API design, including:
- Ray basic concepts
- Resource Pool and RayWorkerGroup
- Data Dispatch, Execution and Collection
- Initialize the RayWorkerGroup and execute the distributed computation in the given Resource Pool
See details in `tutorial.ipynb <https://github.com/volcengine/verl/blob/main/examples/ray/tutorial.ipynb>`_.
RL(HF) algorithms with LoRA Support
===========================================
We support LoRA (Low-Rank Adaptation) for reinforcement learning algorithms such as PPO, GRPO, and others.
LoRA is a parameter-efficient fine-tuning technique that injects trainable low-rank matrices into pre-trained weights (typically linear layers). This reduces memory footprint and compute cost, making it possible to fine-tune large models with limited hardware.
The benefits this brings include:
- reinforcement learning with very large models (e.g. 70B+) on modest hardware (e.g. 8x80GB GPUs),
- larger batch sizes thanks to the reduced memory usage,
- simpler model transfer and deployment, as only the LoRA adapters need to be saved,
- compatibility with techniques like `SLoRA <https://arxiv.org/abs/2311.03285>`_ or `CCoE <https://arxiv.org/abs/2407.11686>`_ to serve multiple LoRA adapters efficiently.
This guide explains how to enable LoRA in RL training and configure related parameters.
Usage Guide
------------------------
1. LoRA is available in `verl.trainer.ppo.ray_trainer.RayPPOTrainer`. Examples are provided via the `verl.trainer.main_ppo` entry point.
2. Currently, LoRA is supported via Hugging Face peft, and only with the fsdp/fsdp2 strategy and the vllm rollout backend (sglang support coming soon).
- `strategy=fsdp` or `strategy=fsdp2`
- `rollout.name=vllm`
3. Required configurations for LoRA:
- `actor_rollout_ref.model.lora_rank`: int, set to a reasonable value greater than 0 (e.g., 8, 16, 32, 64)
- `actor_rollout_ref.model.lora_alpha`: float, the alpha term in LoRA
- `actor_rollout_ref.rollout.load_format="safetensors"`: required. This enables vLLM to load the base model.
- `actor_rollout_ref.model.target_modules`: the target modules for LoRA. Typically set to "all-linear".
4. Recommended options:
- `actor_rollout_ref.model.use_shm=True`: preload the model into `/dev/shm` to improve model loading speed.
- `actor_rollout_ref.rollout.layered_summon=True`: this enables the actor model to gather the FSDP shards layer by layer when synchronizing the LoRA adapter to vLLM, thereby reducing peak GPU memory. Recommended if the model is very large (70B+) or GPU memory is limited (< 48GB).
Best Practices and Notes
-------------------------
1. **Learning rate**: it is recommended to increase the learning rate by an order of magnitude.
2. **LoRA Rank**:
- Too small a rank can hurt convergence.
- LoRA rank recommendation from @thelongestusernameofall:
- A very small lora_rank can lead to slower convergence or worse training performance. It is recommended to set lora_rank to be >= 32. Tests have shown that for a 0.5B model, with lora_rank=32, the training convergence speed and final performance are almost identical to non-LoRA training.
- For a 32B model, with lora_rank=128, the training convergence speed and final performance are also almost identical to non-LoRA training.
- More comprehensive reference results are coming soon.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/f2b80b8b26829124dd393b7a795a0640eff11644/docs/lora.jpg?raw=true
3. Reference configuration for RL training with the Qwen2.5-72B model using 8 x 80GB GPUs (increase lora_rank if needed):
.. code-block::
data.train_batch_size=64 \
actor_rollout_ref.model.use_shm=True \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.model.target_modules=all-linear \
actor_rollout_ref.actor.optim.lr=3e-5 \
actor_rollout_ref.actor.fsdp_config.fsdp_size=8 \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=64 \
actor_rollout_ref.rollout.max_model_len=1536 \
actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
actor_rollout_ref.rollout.load_format=safetensors \
actor_rollout_ref.rollout.layered_summon=True \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
Example Script
-------------------
For an end-to-end example, refer to the script below:
examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh
RoPE Scaling override
=======================================
Some models such as `Qwen/Qwen2.5-7B-Instruct <https://huggingface.co/Qwen/Qwen2.5-7B-Instruct#processing-long-texts>`_ support RoPE Scaling but don't have it defined in their config.json file.
For example, this model supports this configuration:
.. code:: python
{
...,
"rope_scaling": {
"factor": 4.0,
"original_max_position_embeddings": 32768,
"type": "yarn"
}
}
In order to support a longer context for such models, you must override the model configs when starting the trainer.
PPO example:
.. code:: bash
+actor_rollout_ref.model.override_config.rope_scaling.type=yarn \
+actor_rollout_ref.model.override_config.rope_scaling.factor=4.0 \
+actor_rollout_ref.model.override_config.rope_scaling.original_max_position_embeddings=32768 \
And for the critic model:
.. code:: bash
+critic.model.override_config.rope_scaling.type=yarn \
+critic.model.override_config.rope_scaling.factor=4.0 \
+critic.model.override_config.rope_scaling.original_max_position_embeddings=32768 \
# Algorithm Baselines
## Math related datasets
Assuming the GSM8K/MATH datasets are preprocessed via:
```bash
python3 examples/data_preprocess/*.py
```
Refer to the table below to reproduce RL training from different pre-trained checkpoints. Unless specified otherwise, the scores below are on the GSM8K dataset. More comprehensive benchmark results are available in the recipe folder.
| Hardware | Model | Method | Test score | Details |
|-------------|----------------------------------|-------------------|--------------|---------|
| NVIDIA GPU | google/gemma-2-2b-it | hf checkpoint | 23.9 | [Huggingface](https://huggingface.co/google/gemma-2-2b-it#benchmark-results) |
| NVIDIA GPU | google/gemma-2-2b-it | SFT | 52.06 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-sft-0.411.log) |
| NVIDIA GPU | google/gemma-2-2b-it | SFT + PPO | 64.02 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-ppo-bsz512_4-prompt1024-resp-512-0.640.log), [wandb](https://api.wandb.ai/links/verl-team/h7ux8602) |
| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | hf checkpoint | 36.4 | [Qwen blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [command and log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PRIME | 58.7 | [script](https://github.com/volcengine/verl/blob/main/recipe/prime/run_prime_qwen.sh), [wandb](https://api.wandb.ai/links/zefan-wang-thu-tsinghua-university/rxd1btvb) |
| NVIDIA GPU | deepseek-ai/deepseek-llm-7b-chat | PPO (Megatron) | 69.5 [1] | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/deepseek-llm-7b-chat-megatron-bsz256_4-prompt512-resp512-0.695.log), [wandb](https://wandb.ai/verl-team/verl_megatron_gsm8k_examples/runs/10fetyr3) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO | 89 | [script](https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (FSDP2) | 89.8 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (Megatron) | 89.6 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b_math_megatron.log) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | ReMax | 97 | [script](https://github.com/eric-haibin-lin/verl/blob/main/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh), [wandb](https://wandb.ai/liziniu1997/verl_remax_example_gsm8k/runs/vxl10pln) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPPO | 65.6 (MATH) | [SPPO script](https://github.com/volcengine/verl/tree/main/recipe/sppo/README.md) |
| NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | Instruct model | 83.7 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
| NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | RLOO (Megatron) | 92.3 | [wandb](https://api.wandb.ai/links/ppo_dev/sbuiuf2d) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPIN | 92 | [script](https://github.com/volcengine/verl/tree/main/recipe/spin/README.md) |
| NVIDIA GPU | Qwen/Qwen2.5-VL-7B-Instruct | GRPO (Megatron) | 65.4 (GEO3k) | [script](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh), [wandb](https://api.wandb.ai/links/megatron-core-moe-dev/1yngvkek) |
| AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | PPO | 70.5 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/ppo_run_deepseek7b_llm.log) |
| AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | GRPO | 71.4 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/grpo_run_deepseek7b_llm.log) |
## Coding related datasets
Unless specified otherwise, the results below are on the LeetCode dataset.
| Hardware | Model | Method | Test score | Details |
|-------------|----------------------------------|-------------------|--------------|---------|
| NVIDIA GPU  | PRIME-RL/Eurus-2-7B-SFT          | PRIME             | 36.1         | [script](https://github.com/volcengine/verl/blob/main/recipe/prime/run_prime_qwen_code.sh), [swanlab](https://swanlab.cn/@wangzefan/prime_example/runs/7f541qhspgmy8nmhdlx35/chart) |
### Notes
[1] During evaluation, we have only extracted answers following the format `"####"`. A more flexible answer extraction, longer response length, and better prompt engineering may lead to a higher score.
[2] The default value of `actor_rollout_ref.actor.entropy_coeff` is set to `0.0` since verl 0.3.x on 2025-05-30, which is different from previous versions.
# Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)
> Open-Source Algorithm Implementation & Experiment Running: [Yuxuan Tong](https://tongyx361.github.io/), [Guangming Sheng](https://hk.linkedin.com/in/guangming-sheng-b50640211)
🏠 [Homepage](https://dapo-sia.github.io/) | 📝 [Paper](https://dapo-sia.github.io/static/pdf/dapo_paper.pdf) | 🤗 [Datasets&Models@HF](https://huggingface.co/collections/BytedTsinghua-SIA/dapo-67d7f1517ee33c8aed059da0) | 🐱 [Code@GitHub](https://github.com/volcengine/verl/tree/gm-tyx/puffin/main/recipe/dapo) | 🐱 [Repo@GitHub](https://github.com/BytedTsinghua-SIA/DAPO)
> We propose the **D**ecoupled Clip and Dynamic s**A**mpling **P**olicy **O**ptimization (DAPO) algorithm. By making our work publicly available, we provide the broader research community and society with practical access to scalable reinforcement learning, enabling all to benefit from these advancements. DAPO training on the Qwen2.5-32B base model outperforms the previous state-of-the-art DeepSeek-R1-Zero-Qwen-32B on AIME 2024, achieving **50%** accuracy with **50%** fewer training steps.
>
> ![dapo-main-result](https://dapo-sia.github.io/static/images/score.png)
## Quickstart
1. Prepare the datasets **on the Ray cluster**:
```bash
bash prepare_dapo_data.sh # This downloads the datasets to ${HOME}/verl/data by default
```
2. Submit the job to the Ray cluster **from any machine**:
```bash
cd verl # Repo root
export RAY_ADDRESS="http://${RAY_IP:-localhost}:8265" # The Ray cluster address to connect to
export WORKING_DIR="${PWD}" # The local directory to package to the Ray cluster
# Set the runtime environment like env vars and pip packages for the Ray cluster in yaml
export RUNTIME_ENV="./verl/trainer/runtime_env.yaml"
bash recipe/dapo/run_dapo_qwen2.5_32b.sh
```
## Reproduction Runs
| Setup | AIME 2024 Acc. | Training Script | Training Record |
| -------------------------------------------- | -------------- | ---------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |
| DAPO w/o Token-level Loss & Dynamic Sampling | 44% | [run_dapo_early_qwen2.5_32b.sh](./run_dapo_early_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
| DAPO w/o Dynamic Sampling | 50% | [run_dapo_wo_ds_qwen2.5_32b.sh](./run_dapo_wo_ds_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
| DAPO | 52% | [run_dapo_qwen2.5_32b.sh](./run_dapo_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
## Configuration
> [!NOTE]
> Most experiments in the paper, including the best-performing one, are run without Overlong Filtering because it largely overlaps with Overlong Reward Shaping in terms of properly learning from the longest outputs, so we don't implement it here.
### Separated Clip Epsilons (-> Clip-Higher)
An example configuration:
```yaml
actor_rollout_ref:
actor:
clip_ratio_low: 0.2
clip_ratio_high: 0.28
```
`clip_ratio_low` and `clip_ratio_high` specify the $\varepsilon_{\text {low }}$ and $\varepsilon_{\text {high }}$ in the DAPO objective.
Core relevant code:
```python
pg_losses1 = -advantages * ratio
pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
pg_losses = torch.maximum(pg_losses1, pg_losses2)
```
### Dynamic Sampling (with Group Filtering)
An example configuration:
```yaml
data:
gen_batch_size: 1536
train_batch_size: 512
algorithm:
filter_groups:
enable: True
metric: acc # score / seq_reward / seq_final_reward / ...
max_num_gen_batches: 10 # Non-positive values mean no upper limit
```
Setting `filter_groups.enable` to `True` will filter out groups whose outputs' `metric` are all the same, e.g., for `acc`, groups whose outputs' accuracies are all 1 or 0.
The trainer will repeat sampling with `gen_batch_size` until there are enough qualified groups to fill `train_batch_size`, or until the upper limit specified by `max_num_gen_batches` is reached.
Core relevant code:
```python
prompt_bsz = self.config.data.train_batch_size
if num_prompt_in_batch < prompt_bsz:
print(f'{num_prompt_in_batch=} < {prompt_bsz=}')
num_gen_batches += 1
max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...')
continue
else:
raise ValueError(
f'{num_gen_batches=} >= {max_num_gen_batches=}. Generated too many. Please check your data.'
)
else:
# Align the batch
traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
batch = batch[:traj_bsz]
```
### Flexible Loss Aggregation Mode (-> Token-level Loss)
An example configuration:
```yaml
actor_rollout_ref:
actor:
loss_agg_mode: "token-mean" # / "seq-mean-token-sum" / "seq-mean-token-mean"
# NOTE: "token-mean" is the default behavior
```
Setting `loss_agg_mode` to `token-mean` will average the (policy gradient) loss over all the tokens in all the sequences in a mini-batch.
Core relevant code:
```python
if loss_agg_mode == "token-mean":
loss = verl_F.masked_mean(loss_mat, loss_mask)
elif loss_agg_mode == "seq-mean-token-sum":
seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) # token-sum
loss = torch.mean(seq_losses) # seq-mean
elif loss_agg_mode == "seq-mean-token-mean":
seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / torch.sum(loss_mask, dim=-1) # token-mean
loss = torch.mean(seq_losses) # seq-mean
else:
raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}")
```
### Overlong Reward Shaping
An example configuration:
```yaml
data:
max_response_length: 20480 # 16384 + 4096
reward_model:
overlong_buffer:
enable: True
len: 4096
penalty_factor: 1.0
```
Setting `overlong_buffer.enable` to `True` will penalize the outputs whose lengths are overlong but still within the hard context limit.
Specifically, the penalty increases linearly from `0` to `overlong_buffer.penalty_factor` when the length of the output exceeds the `max_response_length` by `0` to `overlong_buffer.len` tokens.
Core relevant code:
```python
if self.overlong_buffer_cfg.enable:
overlong_buffer_len = self.overlong_buffer_cfg.len
expected_len = self.max_resp_len - overlong_buffer_len
exceed_len = valid_response_length - expected_len
overlong_penalty_factor = self.overlong_buffer_cfg.penalty_factor
overlong_reward = min(-exceed_len / overlong_buffer_len * overlong_penalty_factor, 0)
reward += overlong_reward
```
# Group Relative Policy Optimization (GRPO)
In reinforcement learning, classic algorithms like PPO rely on a "critic" model to estimate the value of actions, guiding the learning process. However, training this critic model can be resource-intensive.
GRPO simplifies this process by eliminating the need for a separate critic model. Instead, it operates as follows:
- Group Sampling: For a given problem, the model generates multiple possible solutions, forming a "group" of outputs.
- Reward Assignment: Each solution is evaluated and assigned a reward based on its correctness or quality.
- Baseline Calculation: The average reward of the group serves as a baseline.
- Policy Update: The model updates its parameters by comparing each solution's reward to the group baseline, reinforcing better-than-average solutions and discouraging worse-than-average ones.
This approach reduces computational overhead by avoiding the training of a separate value estimation model, making the learning process more efficient. For more details, refer to the original paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://arxiv.org/pdf/2402.03300)
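To make the baseline and update steps concrete, here is a minimal sketch of group-relative advantage computation. It is illustrative only, not verl's exact implementation; verl computes this inside its advantage estimator when `algorithm.adv_estimator=grpo`.

```python
import torch

def grpo_advantages(rewards: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """Group-relative advantages for one prompt: reward minus the group-mean
    baseline, normalized by the group standard deviation."""
    baseline = rewards.mean()
    return (rewards - baseline) / (rewards.std() + eps)

# Example: 4 sampled responses for the same prompt, scored 0/1 for correctness.
print(grpo_advantages(torch.tensor([1.0, 0.0, 0.0, 1.0])))
```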
## Key Components
- No Value Function (Critic-less): unlike PPO, GRPO does not train a separate value network (critic)
- Group Sampling (Grouped Rollouts): instead of evaluating one rollout per input, GRPO generates multiple completions (responses) from the current policy for each prompt. This set of completions is referred to as a group.
- Relative Rewards: within each group, completions are scored (e.g., based on correctness), and rewards are normalized relative to the group.
## Configuration
Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
Despite that many configurations start with the `ppo_` prefix, they work across different RL algorithms in verl, as the GRPO training loop is similar to that of PPO (without critic).
![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d)
- `actor_rollout_ref.rollout.n`: For each prompt, sample n times. Defaults to 1. For GRPO, please set it to a value larger than 1 for group sampling.
- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`.
- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers.
- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for GRPO updates on one set of sampled trajectories for actor
- `actor_rollout_ref.actor.clip_ratio`: The GRPO clip range. Default to 0.2
- `algorithm.adv_estimator`: Default is gae. Please set it to grpo instead
- `actor_rollout_ref.actor.loss_agg_mode`: Default is "token-mean". Options include "token-mean", "seq-mean-token-sum", "seq-mean-token-mean". The original GRPO paper takes the sample-level loss (seq-mean-token-mean), which may be unstable in long-CoT scenarios. All GRPO example scripts provided in verl use the default "token-mean" loss aggregation instead.
Instead of adding KL penalty in the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss:
- `actor_rollout_ref.actor.use_kl_loss`: To use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False. Please set it to True for GRPO.
- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
- `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. How to calculate the kl divergence between actor and reference policy. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
## Advanced Extensions
### DrGRPO
[Understanding R1-Zero-Like Training: A Critical Perspective](https://arxiv.org/pdf/2503.20783) claims there's optimization bias in GRPO, which leads to artificially longer responses, especially for incorrect outputs. This inefficiency stems from the way GRPO calculates advantages using group-based reward normalization. Instead, DrGRPO aggregates token-level losses by normalizing with a global constant to eliminate length bias.
Configure the following to enable DrGRPO, with all other parameters the same as GRPO's:
- `actor_rollout_ref.actor.loss_agg_mode`: "seq-mean-token-sum-norm", which turns off seq-dim averaging (a sketch of this aggregation follows this list)
- `actor_rollout_ref.actor.use_kl_loss`: Please set it to False for DrGRPO
- `algorithm.norm_adv_by_std_in_grpo`: False, which turns off standard deviation norm
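For intuition, here is a minimal sketch of the aggregation referenced in the list above: token losses are summed per sequence and then divided by a global constant rather than by each sequence's own length. The constant name `MAX_RESP_LEN` and the exact normalization are assumptions for this example; see `agg_loss` in `verl.trainer.ppo.core_algos` for the actual `seq-mean-token-sum-norm` implementation.

```python
import torch

MAX_RESP_LEN = 1024  # assumed global constant, e.g. the generation budget

def drgrpo_aggregate(loss_mat: torch.Tensor, loss_mask: torch.Tensor) -> torch.Tensor:
    """Sum token losses per sequence, then normalize by a global constant
    instead of each sequence's own length, removing the length bias."""
    seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)  # token-sum per sequence
    return torch.sum(seq_losses) / (loss_mask.shape[0] * MAX_RESP_LEN)

# Example with a batch of 2 sequences of up to 4 tokens (made-up values).
loss_mat = torch.rand(2, 4)
loss_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=torch.float32)
print(drgrpo_aggregate(loss_mat, loss_mask))
```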
## Reference Example
Qwen2.5 GRPO training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log)
```bash
bash examples/grpo_trainer/run_qwen3-8b.sh
```
For more reference performance, please see https://verl.readthedocs.io/en/latest/algo/baseline.html
# On-Policy RL with Optimal Reward Baseline (OPO)
Loose on-policy constraints and suboptimal baselines in reinforcement learning often lead to training instability such as large policy shifts and entropy collapse. OPO addresses these challenges by using exact on-policy training with the theoretically optimal reward baseline for advantage estimation. It achieves lower policy shifts and higher output entropy, encouraging more diverse and less repetitive responses.
OPO uses group sampling to generate multiple outputs for each input, like GRPO. Unlike group-based algorithms which typically use the mean reward of a group as the baseline, OPO employs a theoretically optimal baseline: the length-weighted reward of the group. It also omits the standard deviation normalization. By adopting these two key components, OPO enables the training of a single policy model with the objective of maximizing only the expected reward. For more details, refer to the original paper [On-Policy RL with Optimal Reward Baseline](https://arxiv.org/pdf/2505.23585).
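For intuition, here is a minimal sketch of the length-weighted baseline described above. It is illustrative only; the actual computation lives in verl's advantage estimator for `adv_estimator: opo`.

```python
import torch

def opo_advantages(rewards: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    """Length-weighted baseline for one group of sampled responses.

    rewards: (n,) scalar reward per response; lengths: (n,) response lengths in tokens.
    """
    baseline = (lengths * rewards).sum() / lengths.sum()  # length-weighted mean reward
    return rewards - baseline                             # no std normalization in OPO

# Example group: longer responses pull the baseline toward their rewards.
print(opo_advantages(torch.tensor([1.0, 0.0, 1.0]), torch.tensor([200.0, 800.0, 400.0])))
```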
## Key Components
- Exact On-Policy Training: always generates responses from the current policy, without using any pre-generated data or off-policy data.
- Optimal Reward Baseline: uses a length-weighted reward of the group as the baseline for normalizing the rewards.
## Configuration
To configure OPO within the framework, use the following YAML settings. These parameters are crucial for enabling exact on-policy training and activating the optimal reward baseline.
```yaml
algorithm:
adv_estimator: opo # Use OPO for optimal reward baseline
data:
train_batch_size: 1024
actor_rollout_ref:
actor:
ppo_mini_batch_size: 1024 # ppo_mini_batch_size should equal to train_batch_size to enable exact on-policy training
entropy_coeff: 0 # disable entropy regularization
use_kl_loss: False # disable kl regularization
kl_loss_coef: 0
```
## Advanced Extensions
OPO can also be extended to other algorithms like RLOO and Reinforce++: simply adjust their configurations to enable exact on-policy training and incorporate the optimal length-weighted reward baseline, with minimal modifications to their advantage estimation functions.
# Proximal Policy Optimization (PPO)
Proximal Policy Optimization (PPO) is a family of policy gradient methods for reinforcement learning, proposed by OpenAI in 2017. PPO strikes a balance between simplicity, stability, and performance, making it one of the most widely used algorithms in modern RL applications, including large-scale language model fine-tuning.
Traditional policy gradient methods like REINFORCE or Vanilla Policy Gradient suffer from:
- High variance and sample inefficiency.
- Instability due to large policy updates.
PPO addresses this problem using a clipped surrogate objective that avoids overly large updates without requiring second-order derivatives.
For more technical details regarding PPO, we suggest reading the introduction in the [OpenAI spinning up tutorial](https://spinningup.openai.com/en/latest/algorithms/ppo.html), and the paper [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347).
## Key Components
- Actor-Critic Architecture: PPO requires both an actor model (policy) and a critic model (value function). This differs from other algorithms like GRPO and RLOO that don't require a critic model.
- Generalized Advantage Estimation (GAE): PPO uses GAE for computing advantage values, which helps reduce variance in policy gradient estimates while maintaining low bias.
- Clipped Surrogate Objective: The core of PPO is the clipped surrogate objective function that limits the size of policy updates (a minimal sketch follows this list).
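Here is a minimal sketch of the clipped surrogate loss referenced in the list above. It is illustrative only; verl's `compute_policy_loss` in `verl.trainer.ppo.core_algos` additionally handles response masking and the loss aggregation modes.

```python
import torch

def ppo_clip_loss(log_prob: torch.Tensor, old_log_prob: torch.Tensor,
                  advantages: torch.Tensor, clip_ratio: float = 0.2) -> torch.Tensor:
    """Standard clipped surrogate loss over per-token quantities."""
    ratio = torch.exp(log_prob - old_log_prob)            # pi_theta / pi_theta_old
    unclipped = -advantages * ratio
    clipped = -advantages * torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    return torch.mean(torch.maximum(unclipped, clipped))  # pessimistic (clipped) objective

# Example with made-up per-token values.
lp = torch.tensor([-1.0, -0.5, -2.0])
old_lp = torch.tensor([-1.2, -0.4, -1.8])
adv = torch.tensor([0.5, -0.3, 1.0])
print(ppo_clip_loss(lp, old_lp, adv))
```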
## Configuration
Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
Most critic configs are similar to those of actors. Note that the critic model is omitted from the figure below.
![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d)
- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`.
- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers
- `critic.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO critic updates. The ppo_mini_batch_size is a global size across all workers.
- `actor_rollout_ref.actor.clip_ratio`: The PPO clip range. Default to 0.2
- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for actor
- `critic.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for the critic.
- `algorithm.gamma`: The discount factor.
- `algorithm.lam`: The lambda term that trades off between bias and variance in the GAE estimator
- `algorithm.adv_estimator`: Support gae, grpo, reinforce_plus_plus, reinforce_plus_plus_baseline, rloo
## Advanced Extensions
### KL Divergence Control
Options to prevent the policy from diverging too far from a reference policy. Two mechanisms are available: KL reward penalty and KL loss. For more technical details, see [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
Options to use KL loss for KL divergence control:
- `actor_rollout_ref.actor.use_kl_loss`: to use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False
- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
- `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. How to calculate the kl divergence between actor and reference policy. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
Options to use KL penalty in the reward:
- `algorithm.use_kl_in_reward`: Whether to enable in-reward kl penalty. Default is False.
- `algorithm.kl_penalty`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. This defines the way to calculate the kl divergence between actor and reference policy. For specific options, refer to `kl_penalty` in core_algos.py. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html (a sketch of these estimators follows this list).
- `algorithm.kl_ctrl.kl_coef`: The (initial) coefficient of in-reward kl_penalty. Default is 0.001.
- `algorithm.kl_ctrl.type`: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController.
- `algorithm.kl_ctrl.horizon`: See source code of AdaptiveKLController for details.
- `algorithm.kl_ctrl.target_kl`: See source code of AdaptiveKLController for details.
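Here is a minimal sketch of the k1/k2/k3 estimators referenced in the lists above, written for per-token log-probabilities with tokens sampled from the actor. It is illustrative only; `kl_penalty` in core_algos.py is the authoritative implementation and may differ in details such as clamping.

```python
import torch

def kl_estimate(logprob: torch.Tensor, ref_logprob: torch.Tensor, kind: str = "kl") -> torch.Tensor:
    """Per-token estimators of KL(actor || ref), with tokens sampled from the actor."""
    log_ratio = logprob - ref_logprob
    if kind == "kl":           # k1: unbiased but high variance
        return log_ratio
    if kind == "abs":
        return log_ratio.abs()
    if kind == "mse":          # k2: 0.5 * (log ratio)^2
        return 0.5 * log_ratio.square()
    if kind == "low_var_kl":   # k3: low variance and always >= 0
        return torch.exp(-log_ratio) - 1 + log_ratio
    raise ValueError(f"unknown estimator: {kind}")

# Example with made-up per-token log-probs.
lp = torch.tensor([-1.0, -0.7, -2.3])
ref = torch.tensor([-1.1, -0.6, -2.0])
print(kl_estimate(lp, ref, "low_var_kl"))
```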
### Dual-clip PPO
Dual-Clip PPO adds a lower bound to the clipped surrogate objective for tokens with negative advantages: when the probability ratio becomes very large, the per-token loss is capped at `clip_ratio_c` times the advantage magnitude instead of growing without bound (a sketch follows the configuration option below).
![image](https://github.com/user-attachments/assets/fc232181-d8b0-4307-8dd2-4dc0a4c1c139)
- `actor_rollout_ref.actor.clip_ratio_c`: lower bound of the value for Dual-clip PPO, defaults to 3.0
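A minimal sketch of the dual-clip rule (illustrative only; verl applies this inside its policy-loss computation when `clip_ratio_c` is set):

```python
import torch

def dual_clip_ppo_loss(ratio: torch.Tensor, advantages: torch.Tensor,
                       clip_ratio: float = 0.2, clip_ratio_c: float = 3.0) -> torch.Tensor:
    """Clipped surrogate with an extra bound applied only when advantages < 0."""
    losses1 = -advantages * ratio
    losses2 = -advantages * torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    clipped = torch.maximum(losses1, losses2)          # standard PPO clipping
    losses3 = -advantages * clip_ratio_c               # cap for negative advantages
    dual_clipped = torch.minimum(clipped, losses3)     # bound the loss when the ratio explodes
    return torch.mean(torch.where(advantages < 0, dual_clipped, clipped))

# Example: a very large ratio with a negative advantage is bounded by clip_ratio_c.
print(dual_clip_ppo_loss(torch.tensor([10.0, 1.0]), torch.tensor([-1.0, 0.5])))
```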
## Reference Example
Qwen2.5 training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log)
```bash
bash run_gemma.sh
trainer.n_gpus_per_node=1 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
trainer.logger=['console'] \
critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
data.train_batch_size=256 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size=2 \
critic.ppo_micro_batch_size=2
```
Reference performance with verl v0.2:
| Model | Method | Score | Link |
|-------------------------------|------------------|-------|------------------------------------------------------------------------------------------------|
| Qwen/Qwen2.5-0.5B-Instruct | pretrained model | 36.4 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
| Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [PPO Command and Logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
# Recipe: Self-Play Preference Optimization (SPPO)
verl provides a community recipe implementation for the paper [Self-Play Preference Optimization for Language Model Alignment](https://arxiv.org/abs/2405.00675). SPPO can significantly enhance the performance of an LLM without strong external signals such as responses or preferences from GPT-4. It can outperform the model trained with iterative direct preference optimization (DPO), among other methods. SPPO is theoretically grounded, ensuring that the LLM can converge to the von Neumann winner (i.e., Nash equilibrium) under general, potentially intransitive preference, and empirically validated through extensive evaluations on multiple datasets.
Paper Authors: [Yue Wu](https://yuewu.us/)\*, [Zhiqing Sun](https://www.cs.cmu.edu/~zhiqings/)\*, [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ)\*, [Kaixuan Ji](https://scholar.google.com/citations?user=FOoKDukAAAAJ), [Yiming Yang](https://www.cs.cmu.edu/~yiming/), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
verl Implementation Authors: [Yuhao Yang](https://github.com/yhyang201), [Chenyang Zhao](https://github.com/zhaochenyang20)
[[Webpage](https://uclaml.github.io/SPPO/)] [[Huggingface](https://huggingface.co/papers/2405.00675)] [[Paper](https://arxiv.org/abs/2405.00675)][[Original Implementation](https://github.com/uclaml/SPPO)]
## Reproduce the Experiment
We evaluate the performance of SPPO on the MATH dataset. Starting from an initial score of 46.6 with Qwen2.5-7B-Instruct, we achieve a score of 65.6 after 20 epochs of training, placing our model approximately in the top 20 on the [MATH leaderboard](https://paperswithcode.com/sota/math-word-problem-solving-on-math). It's important to note that verl's internal evaluation metrics may not perfectly align with the official evaluation methodology for Qwen2.5-7B-Instruct. Therefore, for consistency and fair comparison, we report only the results based on verl's evaluation framework.
```
git clone git@github.com:volcengine/verl.git
cd verl
python3 -m uv pip install -e ".[sglang]"
export WANDB_API_KEY=<YOUR_WANDB_API_KEY>
python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
huggingface-cli download Qwen/Qwen2.5-7B-Instruct --local-dir $HOME/models/Qwen2.5-7B-Instruct
export CUDA_VISIBLE_DEVICES=0,1,2,3
bash recipe/sppo/run_qwen2.5-7b_rm.sh
```
Note that the installation may occasionally fail to install flash-attn. If this happens, you can install it manually by running:
```bash
python3 -m uv pip install wheel
python3 -m uv pip install packaging
python3 -m uv pip install flash-attn --no-build-isolation --no-deps
```
## Acknowledgement
We sincerely thank the contribution and guidance from:
- [Yue Wu](https://yuewu.us/)
- [Chendong Wang](https://cdwang96.github.io/)
- [Yifan Zhang](https://github.com/yifanzhang-pro)
- [Yongan Xiang](https://github.com/BearBiscuit05)
- [Junrong Lin](https://github.com/ocss884)
- [Yuxuan Tong](https://github.com/tongyx361)
- [Guangming Shen](https://github.com/PeterSH6)
- [Biao He](https://www.linkedin.com/in/biao-he/)
- [Qingquan Song](https://qingquansong.github.io/)
- [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
# Setup
## Dockerfile.rocm
```bash
# Build the docker in the repo dir:
# docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
# docker images # you can find your built docker
#
FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Set working directory
# WORKDIR $PWD/app
# Set environment variables
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Install vllm
RUN pip uninstall -y vllm && \
rm -rf vllm && \
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
cd vllm && \
MAX_JOBS=$(nproc) python3 setup.py install && \
cd .. && \
rm -rf vllm
# Copy the entire project directory
COPY . .
# Install dependencies
RUN pip install "tensordict<0.6" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]" \
torchdata \
transformers \
wandb \
orjson \
pybind11 && \
pip install -e . --no-deps
```
## Build the image:
```bash
docker build -t verl-rocm .
```
## Run the container
```bash
docker run --rm -it \
--device /dev/dri \
--device /dev/kfd \
-p 8265:8265 \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME/.ssh:/root/.ssh \
-v $HOME:$HOME \
--shm-size 128G \
-w $PWD \
verl-rocm \
/bin/bash
```
# Example
## PPO
```bash
YOUR_PROJECT_NAME=r1-verl-ppo-upstream
YOUR_RUN_NAME=r1-training_ppo-upstream
# export HYDRA_FULL_ERROR=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=256 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=256 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
critic.optim.lr=1e-5 \
critic.model.path=$MODEL_PATH \
critic.ppo_micro_batch_size_per_gpu=4 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.logger=['console'] \
trainer.project_name=$YOUR_PROJECT_NAME \
trainer.experiment_name=$YOUR_RUN_NAME \
+trainer.val_before_train=False \
trainer.default_hdfs_dir=null \
trainer.n_gpus_per_node=$GPUS_PER_NODE \
trainer.nnodes=1 \
trainer.save_freq=10 \
trainer.test_freq=10 \
trainer.total_epochs=15 #2>&1 | tee verl_demo.log
```
## GRPO
```bash
YOUR_PROJECT_NAME=r1-verl-grpo-upstream
YOUR_RUN_NAME=r1-training_grpo-upstream
# export HYDRA_FULL_ERROR=1
# export FSDP_VERBOSE=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
# MODEL_PATH=Qwen/Qwen2-7B-Instruct
python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=1024 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.model.enable_gradient_checkpointing=False \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.fsdp_config.param_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name=$YOUR_PROJECT_NAME \
trainer.experiment_name=$YOUR_RUN_NAME \
trainer.n_gpus_per_node=$GPUS_PER_NODE \
+trainer.val_before_train=False \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15
```
# Setup
## Docker:
Find the docker here: https://hub.docker.com/r/rocm/vllm/tags (rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4)
```bash
docker run --rm -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v /home/yushensu:/home/yushensu \
-v $HOME/.ssh:/root/.ssh \
--shm-size 128G \
--name verl_vllm_upstream \
-w $PWD \
rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 \
/bin/bash
```
## Build ROCM vLLM:
```bash
pip uninstall -y vllm
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git
cd vllm
export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
export MAX_JOBS=$(nproc)
# python3 setup.py develop # will not create src need to keep the repo
python3 setup.py install # will add src into py. You can delete the repo
cd ..
rm -rf vllm
```
## Install the required packages:
```bash
pip install "tensordict<0.6" --no-deps
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]" \
torchdata \
transformers \
wandb \
orjson \
pybind11
pip install -e . --no-deps
```
# Example
## PPO
```bash
YOUR_PROJECT_NAME=r1-verl-ppo-upstream
YOUR_RUN_NAME=r1-training_ppo-upstream
# export HYDRA_FULL_ERROR=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=256 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=256 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
critic.optim.lr=1e-5 \
critic.model.path=$MODEL_PATH \
critic.ppo_micro_batch_size_per_gpu=4 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.logger=['console'] \
trainer.project_name=$YOUR_PROJECT_NAME \
trainer.experiment_name=$YOUR_RUN_NAME \
+trainer.val_before_train=False \
trainer.default_hdfs_dir=null \
trainer.n_gpus_per_node=$GPUS_PER_NODE \
trainer.nnodes=1 \
trainer.save_freq=10 \
trainer.test_freq=10 \
trainer.total_epochs=15 #2>&1 | tee verl_demo.log
```
## GRPO
```bash
YOUR_PROJECT_NAME=r1-verl-grpo-upstream
YOUR_RUN_NAME=r1-training_grpo-upstream
# export HYDRA_FULL_ERROR=1
# export FSDP_VERBOSE=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
# MODEL_PATH=Qwen/Qwen2-7B-Instruct
python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=1024 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.model.enable_gradient_checkpointing=False \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.fsdp_config.param_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name=$YOUR_PROJECT_NAME \
trainer.experiment_name=$YOUR_RUN_NAME \
trainer.n_gpus_per_node=$GPUS_PER_NODE \
+trainer.val_before_train=False \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15
```
verl performance tuning for AMD (ROCm Kernel)
=====================================================
Author: `Yang Wang <https://github.com/YangWang92/>`_
Patch vLLM to Enable Sleep Mode for AMD GPUs
--------------------------------------------------------------
By default, verl requires vLLM to enable sleep mode, which allows vLLM to offload GPU memory to CPU memory after rollout. However, this feature is still under review by the vLLM community.
To enable vLLM's sleep mode, you can first build vLLM from the community-patched source code (from `this pull request <https://github.com/vllm-project/vllm/pull/12695>`_). Once the patch is merged into the vLLM main branch, you can directly install vLLM from the latest version.
1. Clone the vLLM repository and build it with the following commands:
.. code-block:: bash
git clone -b sleep_amd https://github.com/HollowMan6/vllm.git
cd vllm
sudo ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so
VLLM_TARGET_DEVICE=rocm ROCM_PATH=/opt/rocm/ VLLM_GPU_LANG=HIP SETUPTOOLS_SCM_PRETEND_VERSION=0.8.4.dev python3 setup.py develop
2. Additionally, make sure the ROCm version in your Docker image is at least ROCm 6.3.4; we recommend ROCm 6.4.0 for better performance (see `this comment <https://github.com/vllm-project/vllm/pull/12695#issuecomment-2637839574>`_).
After the upgrade, you can verify whether sleep mode is enabled by running the following test code (from `this comment <https://github.com/vllm-project/vllm/pull/12695#issuecomment-2637839574>`_).
.. code-block:: python
import torch
from vllm import LLM
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enable_sleep_mode=True)
def run_inference(prompt):
outputs = llm.generate(prompt)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print("CUDA Memory Usage (after inference):")
torch.cuda.empty_cache()
print(f"{torch.cuda.memory_allocated()=}")
run_inference("San Francisco is")
llm.sleep()
print("CUDA Memory Usage (after sleep):")
torch.cuda.empty_cache()
print(f"{torch.cuda.memory_allocated()=}")
llm.wake_up()
print("CUDA Memory Usage (after wakeup):")
torch.cuda.empty_cache()
print(f"{torch.cuda.memory_allocated()=}")
run_inference("Paris is")
If sleep mode is enabled, you should see the memory usage reduce after sleep.
After applying the vLLM patch and completing the installation, you can enable sleep mode in verl to reduce memory overhead. This allows verl to offload unused GPU memory during rollout, significantly lowering the memory footprint during long-context training or multi-node reinforcement learning.
Enable CUDA Graph and Bypass ROCm-related issues
--------------------------------------------------------------
Due to potential issues with CUDA graph capture in ROCm, we’ve found that vLLM’s CUDA graph feature cannot be enabled on multiple nodes in verl on AMD platforms with vLLM V1 mode. This leads to significantly slower rollout performance.
Our investigation shows that ROCm may trigger an unexpected crash when attempting to capture large batches with CUDA graph. One workaround is to patch the LLM configuration (from `this commit <https://github.com/volcengine/verl/blob/v0.3.0.rc0/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py#L100-L115>`_).
.. code-block:: python
self.inference_engine = LLM(
model=model_path,
enable_sleep_mode=True,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend="external_launcher",
dtype=config.dtype,
enforce_eager=config.enforce_eager,
gpu_memory_utilization=config.gpu_memory_utilization,
disable_custom_all_reduce=True,
disable_mm_preprocessor_cache=True,
limit_mm_per_prompt=limit_mm_per_prompt,
skip_tokenizer_init=False,
max_model_len=max_model_len,
load_format=load_format,
disable_log_stats=config.disable_log_stats,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=config.enable_chunked_prefill,
enable_prefix_caching=True,
trust_remote_code=trust_remote_code,
# enable compilation config to bypass oom on rocm
# change depends on your GPU memory size
compilation_config={"cudagraph_capture_sizes": [1, 2, 4, 8, 16, 32, 64]},
seed=config.get('seed', 0),
)
Then, you can enable CUDA graph by setting the following configuration options (see `this page <https://github.com/volcengine/verl/blob/v0.3.0.rc0/docs/README_vllm0.8.md>`_):
.. code-block:: bash
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
Data interface
=========================
DataProto is the interface for data exchange.
The :class:`verl.DataProto` class contains two key members:
- batch: a :class:`tensordict.TensorDict` object for the actual data
- meta_info: a :class:`Dict` with additional meta information
TensorDict
~~~~~~~~~~~~
:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem.
A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size.
.. code-block:: python
>>> import torch
>>> from tensordict import TensorDict
>>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,])
>>> tensordict["twos"] = 2 * torch.ones(2, 5, 6)
>>> zeros = tensordict["zeros"]
>>> tensordict
TensorDict(
fields={
ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([2]),
device=None,
is_shared=False)
One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well.
.. code-block:: python
>>> tensordict[..., :1]
TensorDict(
fields={
ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([1]),
device=None,
is_shared=False)
>>> tensordict = tensordict.to("cuda:0")
>>> tensordict = tensordict.reshape(6)
For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation.
.. _tensordict: https://pytorch.org/tensordict/overview.html
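For illustration, here is a minimal sketch of wrapping rollout tensors into a ``DataProto``. The dataclass-style constructor and the exact ``select`` signature below are assumptions for this example; the authoritative interface is documented in the Core APIs section that follows.

.. code-block:: python

   import torch
   from tensordict import TensorDict
   from verl import DataProto

   # Batch of 4 sequences of 16 tokens each (made-up data).
   batch = TensorDict(
       {
           "input_ids": torch.randint(0, 100, (4, 16)),
           "attention_mask": torch.ones(4, 16, dtype=torch.long),
       },
       batch_size=[4],
   )
   # Assumption: construct DataProto directly from its two documented members.
   data = DataProto(batch=batch, meta_info={"global_step": 0})

   # The documented methods (e.g. select, to) operate on the wrapped TensorDict.
   subset = data.select(batch_keys=["input_ids"])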
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.DataProto
:members: to, select, union, make_iterator, concat
Single Controller interface
============================
The Single Controller provides a unified interface for managing distributed workers
using Ray or other backends and executing functions across them.
It simplifies the process of dispatching tasks and collecting results, particularly
when dealing with data parallelism or model parallelism.
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.single_controller.Worker
:members: __init__, __new__, get_master_addr_port, get_cuda_visible_devices, world_size, rank
.. autoclass:: verl.single_controller.WorkerGroup
:members: __init__, world_size
.. autoclass:: verl.single_controller.ClassWithInitArgs
:members: __init__, __call__
.. autoclass:: verl.single_controller.ResourcePool
:members: __init__, world_size, local_world_size_list, local_rank_list
.. autoclass:: verl.single_controller.ray.RayWorkerGroup
:members: __init__
.. autofunction:: verl.single_controller.ray.create_colocated_worker_cls
Trainer Interface
================================
Trainers drive the training loop. Introducing new trainer classes for new training paradigms is encouraged.
.. autosummary::
:nosignatures:
verl.trainer.ppo.ray_trainer.RayPPOTrainer
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer
:members: __init__, init_workers, fit
.. automodule:: verl.utils.tokenizer
:members: hf_tokenizer
.. automodule:: verl.trainer.ppo.core_algos
:members: agg_loss, kl_penalty, compute_policy_loss, kl_penalty
.. automodule:: verl.trainer.ppo.reward
:members: load_reward_manager, compute_reward, compute_reward_async
.. autoclass:: verl.workers.reward_manager.NaiveRewardManager
.. autoclass:: verl.workers.reward_manager.DAPORewardManager
Utilities
============
This section documents the utility functions and classes in the verl library.
Python Functional Utilities
------------------------------
.. automodule:: verl.utils.py_functional
:members: append_to_dict
File System Utilities
------------------------
.. automodule:: verl.utils.fs
:members: copy_to_local
Tracking Utilities
---------------------
.. automodule:: verl.utils.tracking
:members: Tracking
Metrics Utilities
---------------------
.. automodule:: verl.utils.metric
:members: reduce_metrics
Checkpoint Management
------------------------
.. automodule:: verl.utils.checkpoint.checkpoint_manager
:members: find_latest_ckpt_path
.. automodule:: verl.utils.checkpoint.fsdp_checkpoint_manager
:members: FSDPCheckpointManager
Dataset Utilities
---------------------
.. automodule:: verl.utils.dataset.rl_dataset
:members: RLHFDataset, collate_fn
Torch Functional Utilities
-----------------------------
.. automodule:: verl.utils.torch_functional
:members: get_constant_schedule_with_warmup, masked_whiten, masked_mean, logprobs_from_logits
Sequence Length Balancing
----------------------------
.. automodule:: verl.utils.seqlen_balancing
:members: get_reverse_idx, rearrange_micro_batches
Ulysses Utilities
--------------------
.. automodule:: verl.utils.ulysses
:members: gather_outpus_and_unpad, ulysses_pad_and_slice_inputs
FSDP Utilities
------------------
.. automodule:: verl.utils.fsdp_utils
:members: get_fsdp_wrap_policy, get_init_weight_context_manager, init_fn, load_fsdp_model_to_gpu, load_fsdp_optimizer, offload_fsdp_model_to_cpu, offload_fsdp_optimizer,
Debug Utilities
-------------------
.. automodule:: verl.utils.debug
:members: log_gpu_memory_usage, GPUMemoryLogger
verl x Ascend
===================================
We have added support for Huawei Ascend devices to verl.
Hardware Support
-----------------------------------
Atlas 200T A2 Box16
Atlas 800T A2
Installation
-----------------------------------
Basic Environment Setup
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-----------+-------------+
| software | version |
+-----------+-------------+
| Python | == 3.10 |
+-----------+-------------+
| CANN | == 8.1.RC1 |
+-----------+-------------+
| torch | == 2.5.1 |
+-----------+-------------+
| torch_npu | == 2.5.1.RC1|
+-----------+-------------+
vllm & vllm-ascend
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To use vLLM properly within verl, vllm and vllm-ascend need to be compiled and installed with the following commands. Note that the installation method differs depending on the machine type.
.. code-block:: bash
# vllm
git clone -b v0.7.3 --depth 1 https://github.com/vllm-project/vllm.git
cd vllm
pip install -r requirements-build.txt
# for Atlas 200T A2 Box16
VLLM_TARGET_DEVICE=empty pip install -e . --extra-index https://download.pytorch.org/whl/cpu/
# for Atlas 800T A2
VLLM_TARGET_DEVICE=empty pip install -e .
.. code-block:: bash
# vllm-ascend
git clone -b v0.7.3 --depth 1 https://github.com/vllm-project/vllm-ascend.git
cd vllm-ascend
export COMPILE_CUSTOM_KERNELS=1
python setup.py install
Install verl
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: bash
git clone https://github.com/volcengine/verl.git
cd verl
pip install -r requirements-npu.txt
pip install -e .
Notes on Other Third-Party Libraries
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+--------------+---------------+
| software | description |
+--------------+---------------+
| transformers | >= v4.52.0 |
+--------------+---------------+
| flash_attn | not supported |
+--------------+---------------+
| liger-kernel | not supported |
+--------------+---------------+
| tensordict | 0.8.3 (ARM) |
+--------------+---------------+
1. Enabling --flash_attention_2 through transformers is supported; transformers must be >= 4.52.0.
2. Enabling flash attention acceleration through flash_attn is not supported.
3. liger-kernel is not supported.
4. For ARM servers, tensordict 0.8.3 is required; you can install tensordict manually after the other dependencies are installed.
Quick Start
-----------------------------------
Before regular use, we recommend verifying your environment setup and installation by running GRPO training with Qwen2.5-0.5B.
1. Download the dataset and preprocess it into parquet format so that it contains the fields required for computing RL rewards.
.. code-block:: bash
python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
2. Run the training.
.. code-block:: bash
set -x
export VLLM_ATTENTION_BACKEND=XFORMERS
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=128 \
data.max_prompt_length=512 \
data.max_response_length=128 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.actor.optim.lr=5e-7 \
actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.actor.entropy_coeff=0.001 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=20 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name='verl_grpo_example_gsm8k' \
trainer.experiment_name='qwen2_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=1 \
trainer.device=npu $@
Support Status
-----------------------------------
+-----------+----------------------+-------------+-------------------+----------------------+
| algorithm | model | rewards mae | throughput ratio | hardware |
+-----------+----------------------+-------------+-------------------+----------------------+
| GRPO | Qwen2.5-7B-instruct | 0.38% | 0.588 | Atlas 200T A2 Box16 |
+-----------+----------------------+-------------+-------------------+----------------------+
| GRPO | Qwen2.5-32B-instruct | 0.30% | 0.685 | Atlas 200T A2 Box16 |
+-----------+----------------------+-------------+-------------------+----------------------+
| DAPO | Qwen2.5-7B-instruct | 3.83% | pending | Atlas 200T A2 Box16 |
+-----------+----------------------+-------------+-------------------+----------------------+
GRPO training for Qwen2.5 is currently supported. GRPO training for Qwen2.5-VL will be supported once the following vllm-ascend issues are fixed:
1. `issues#809 <https://github.com/vllm-project/vllm-ascend/issues/809>`_
2. `issues#825 <https://github.com/vllm-project/vllm-ascend/issues/825>`_
Accuracy comparison notes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
For SFT-style algorithms, we expect the mean absolute error of the loss between Huawei Ascend devices and the A100 to be <= 2% under the same configuration. The calculation is shown in the figure below. For more information, see the `accuracy calculation guide <https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/LMaccuracy_0001.html>`_.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/loss_comparison.png?raw=true
:alt: loss_comparison
Empirically, for RL-style algorithms such as GRPO, we expect the mean absolute error of rewards between Huawei Ascend devices and the A100 to be <= 4% under the same configuration; the calculation follows the figure above.
Throughput comparison notes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
For both the Ascend NPU and the A100, we average the "perf/throughput" values of the first 4 steps in the logs; throughput ratio = NPU average / A100 average.
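For example, the calculation amounts to the following sketch (the numbers are placeholders, not measured values):

.. code-block:: python

    # Placeholder values for "perf/throughput" over the first 4 steps.
    npu_throughput = [152.0, 149.5, 151.2, 150.3]
    a100_throughput = [255.1, 254.0, 256.4, 253.5]

    npu_avg = sum(npu_throughput) / len(npu_throughput)
    a100_avg = sum(a100_throughput) / len(a100_throughput)
    print(f"throughput ratio = {npu_avg / a100_avg:.3f}")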
Roadmap
-----------------------------------
See the `roadmap <https://github.com/volcengine/verl/discussions/900>`_ for the support progress of more features.
Disclaimer
-----------------------------------
The Ascend support code provided in verl is for reference only; for commercial use, please reach out through official channels. Thank you.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = "verl"
copyright = "2024 ByteDance Seed Foundation MLSys Team"
author = "Guangming Sheng, Chi Zhang, Yanghua Peng, Haibin Lin"
# -- General configuration ---------------------------------------------------
# The master toctree document.
master_doc = "index"
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"myst_parser",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.autosectionlabel",
"sphinx.ext.napoleon",
]
# Use Google style docstrings instead of NumPy docstrings.
napoleon_google_docstring = True
napoleon_numpy_docstring = False
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
source_suffix = {
".rst": "restructuredtext",
".md": "markdown",
}
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = "en"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
# Add the JavaScript file
html_js_files = [
"js/runllm-widget.js",
]
Data interface
=========================
DataProto is the interface for data exchange.
The :class:`verl.DataProto` class contains two key members:
- batch: a :class:`tensordict.TensorDict` object for the actual data
- meta_info: a :class:`Dict` with additional meta information
TensorDict
~~~~~~~~~~~~
:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem.
A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size.
.. code-block:: python
>>> import torch
>>> from tensordict import TensorDict
>>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,])
>>> tensordict["twos"] = 2 * torch.ones(2, 5, 6)
>>> zeros = tensordict["zeros"]
>>> tensordict
TensorDict(
fields={
ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([2]),
device=None,
is_shared=False)
One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well.
.. code-block:: python
>>> tensordict[..., :1]
TensorDict(
fields={
ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([1]),
device=None,
is_shared=False)
>>> tensordict = tensordict.to("cuda:0")
>>> tensordict = tensordict.reshape(6)
For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation.
.. _tensordict: https://pytorch.org/tensordict/overview.html
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.DataProto
:members: to, select, union, make_iterator, concat
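A minimal usage sketch is shown below. The field names (``input_ids``, ``attention_mask``) and the direct constructor call are illustrative assumptions; consult the class docstring for the exact construction helpers.

.. code-block:: python

    import torch
    from tensordict import TensorDict
    from verl import DataProto

    # Build a toy batch of two samples (field names are only illustrative).
    batch = TensorDict(
        {
            "input_ids": torch.randint(0, 100, (2, 8)),
            "attention_mask": torch.ones(2, 8, dtype=torch.long),
        },
        batch_size=[2],
    )
    data = DataProto(batch=batch, meta_info={"temperature": 1.0})

    # Keep only the tensors a worker needs.
    inputs_only = data.select(batch_keys=["input_ids"])

    # Concatenate two DataProto objects along the batch dimension.
    merged = DataProto.concat([data, data])
    print(merged.batch.batch_size)  # torch.Size([4])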
GSM8K Example
=============
Introduction
------------
In this example, we train an LLM to tackle the GSM8k task.
Paper: https://arxiv.org/pdf/2110.14168
Dataset: https://huggingface.co/datasets/gsm8k
Note that the original paper mainly focuses on training a verifier (a
reward model) to solve math problems via Best-of-N sampling. In this
example, we train an RLHF agent using a rule-based reward model.
Dataset Introduction
--------------------
GSM8k is a math problem dataset. Each prompt is an elementary school
problem, and the LLM is asked to solve it.
The training set contains 7473 samples and the test set contains 1319
samples.
**An example**
Prompt
Katy makes coffee using teaspoons of sugar and cups of water in the
ratio of 7:13. If she used a total of 120 teaspoons of sugar and cups
of water, calculate the number of teaspoonfuls of sugar she used.
Solution
The total ratio representing the ingredients she used to make the
coffee is 7+13 = <<7+13=20>>20. Since the fraction representing the
number of teaspoons she used is 7/20, she used 7/20*120 =
<<7/20*120=42>>42 #### 42
Step 1: Prepare dataset
-----------------------
.. code:: bash
cd examples/data_preprocess
python3 gsm8k.py --local_dir ~/data/gsm8k
Step 2: Download Model
----------------------
There are three ways to prepare the model checkpoints for post-training:
- Download the required models from huggingface or modelscope
.. code:: bash
huggingface-cli download deepseek-ai/deepseek-math-7b-instruct --local-dir ~/models/deepseek-math-7b-instruct --local-dir-use-symlinks False
# or
modelscope download --model deepseek-ai/deepseek-math-7b-instruct --local_dir ~/models/deepseek-math-7b-instruct
- Use a model already stored in a local directory or an HDFS path.
- Also, you can directly use the model name in huggingface (e.g.,
deepseek-ai/deepseek-math-7b-instruct) in
``actor_rollout_ref.model.path`` and ``critic.model.path`` fields in
the run script. You can also download models from modelscope by setting environmental variable ``VERL_USE_MODELSCOPE=True``.
See examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh for example.
Note that users should prepare checkpoints for the actor, critic and reward
model.
[Optional] Step 3: SFT your Model
---------------------------------
We provide an SFT Trainer using PyTorch FSDP in
`fsdp_sft_trainer.py <https://github.com/volcengine/verl/blob/main/verl/trainer/fsdp_sft_trainer.py>`_.
Users can customize their own SFT
script using our FSDP SFT Trainer.
We also provide various training scripts for SFT on GSM8K dataset in `gsm8k sft directory <https://github.com/volcengine/verl/blob/main/examples/sft/gsm8k/>`_.
.. code:: shell
set -x
torchrun -m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=question \
data.response_key=answer \
data.micro_batch_size_per_gpu=8 \
model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \
trainer.default_hdfs_dir=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \
trainer.total_epochs=4 \
trainer.logger=['console','wandb']
If you use AMD GPUs (ROCm kernel), you need to add the following environment variables into the run script:
.. code-block:: bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
Step 4: Perform PPO training with your model on GSM8K Dataset
-------------------------------------------------------------
- Prepare your own run.sh script. Here's an example for GSM8k dataset
and deepseek-llm-7b-chat model.
- Users could replace the ``data.train_files``, ``data.val_files``,
``actor_rollout_ref.model.path`` and ``critic.model.path`` based on
their environment.
- See :doc:`config` for detailed explanation of each config field.
**Reward Model/Function**
We use a rule-based reward model. We force the model to produce a final
answer following four "#" characters, as shown in the solution. We extract the final
answer from both the solution and the model's output using regular
expression matching. We compare them and assign a reward of 1 for a correct
answer, 0.1 for an incorrect answer and 0 for no answer.
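As an illustration, a simplified scorer along these lines could look like the sketch below; the authoritative implementation lives in ``verl/utils/reward_score/gsm8k.py``, and the regex here is only a rough rendition of it.

.. code-block:: python

    import re

    def gsm8k_rule_based_score(solution: str, model_output: str) -> float:
        """Simplified sketch: compare the final answers found after '####'."""
        def extract(text):
            match = re.search(r"####\s*(-?[\d,\.]+)", text)
            return match.group(1).replace(",", "") if match else None

        ground_truth = extract(solution)
        answer = extract(model_output)
        if answer is None:
            return 0.0   # no answer produced
        if answer == ground_truth:
            return 1.0   # correct final answer
        return 0.1       # answered, but incorrect

    print(gsm8k_rule_based_score("... #### 42", "The answer is #### 42"))  # 1.0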
**Training Script**
The example training scripts for the FSDP and Megatron-LM backends are stored in the examples/ppo_trainer directory.
.. code:: bash
cd ../ppo_trainer
bash run_deepseek7b_llm.sh
The script of run_deepseek7b_llm.sh
.. code:: bash
set -x
python3 -m verl.trainer.main_ppo \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.max_prompt_length=512 \
data.max_response_length=512 \
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=True \
critic.ppo_micro_batch_size_per_gpu=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
trainer.project_name='verl_example_gsm8k' \
trainer.experiment_name='deepseek_llm_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=1 \
trainer.total_epochs=15 $@
If you use AMD GPUs (ROCm kernel), you need to add the following environment variables into the run script:
.. code-block:: bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
If you encounter any issues in using AMD GPUs running VeRL, feel free to contact me - `Yusheng Su <https://yushengsu-thu.github.io/>`_.
Multi-Modal Example Architecture
=================================
Introduction
------------
verl now supports multi-modal training. You can use FSDP and
vLLM/SGLang to start a multi-modal RL task. Megatron support is also
on the way.
Follow the steps below to quickly start a multi-modal RL task.
Step 1: Prepare dataset
-----------------------
.. code:: bash
# it will be saved in the $HOME/data/geo3k folder
python examples/data_preprocess/geo3k.py
Step 2: Download Model
----------------------
.. code:: bash
# download the model from huggingface
python3 -c "import transformers; transformers.pipeline(model='Qwen/Qwen2.5-VL-7B-Instruct')"
Step 3: Perform GRPO training with multi-modal model on Geo3K Dataset
---------------------------------------------------------------------
.. code:: bash
# run the task
bash examples/grpo_trainer/run_qwen2_5_vl-7b.sh
PPO Example Architecture
========================
Let's start with the Proximal Policy Optimization algorithm, which is
the most widely used algorithm in LLM post-training.
The main entry point of the PPO algorithm example is:
`main_ppo.py <https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py>`_.
In this tutorial, we will go through the code architecture in `main_ppo.py <https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py>`_.
Define the data
---------------
Users need to preprocess and store the dataset in parquet files.
We implement ``RLHFDataset`` to load and tokenize the parquet files.
For ``RLHFDataset`` (default), at least one field is required:
- ``prompt``: Contains the string prompt
We already provide some examples of processing the datasets to parquet
files in the `data_preprocess directory <https://github.com/volcengine/verl/blob/main/examples/data_preprocess>`_. Currently, we support
preprocessing of the GSM8k, MATH, HellaSwag and Full_hh_rlhf datasets. See :doc:`../preparation/prepare_data` for
more information.
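For instance, a toy parquet file with the required ``prompt`` field could be produced as below. This is only a sketch: the bundled preprocessing scripts attach additional fields (e.g., the data source and reward information), so prefer them for real runs.

.. code-block:: python

    import pandas as pd

    # Two toy prompts; real preprocessing scripts add extra columns
    # (see examples/data_preprocess/ for the full schema).
    df = pd.DataFrame({
        "prompt": [
            "Natalia sold 48 clips in April and half as many in May. How many clips did she sell?",
            "What is 7 + 13?",
        ],
    })
    df.to_parquet("toy_train.parquet")
    print(pd.read_parquet("toy_train.parquet").head())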
Define the reward functions for different datasets
--------------------------------------------------
In this main entry point, the users only need to define their own reward
function based on the datasets (or applications) utilized in PPO
training.
For example, we already provide reward functions for `GSM8k <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/gsm8k.py>`_
and `MATH <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/math.py>`_
datasets in the ``_select_rm_score_fn``. In the ``RewardManager``, we
will compute the reward score based on the data_source to select
corresponding reward functions. For some RLHF datasets (e.g.,
full_hh_rlhf), the reward model is utilized to assess the responses
without any reward functions. In this case, the ``RewardManager`` will
return the ``rm_score`` computed by the reward model directly.
See `reward functions <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score>`_ for detailed implementation.
Define worker classes
---------------------
.. code:: python
if config.actor_rollout_ref.actor.strategy == 'fsdp': # for FSDP backend
assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
from verl.single_controller.ray import RayWorkerGroup
ray_worker_group_cls = RayWorkerGroup
elif config.actor_rollout_ref.actor.strategy == 'megatron': # for Megatron backend
assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
ray_worker_group_cls = NVMegatronRayWorkerGroup # Ray worker class for Megatron-LM
else:
raise NotImplementedError
from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
role_worker_mapping = {
Role.ActorRollout: ActorRolloutRefWorker,
Role.Critic: CriticWorker,
Role.RefPolicy: ActorRolloutRefWorker
}
global_pool_id = 'global_pool'
resource_pool_spec = {
global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
}
mapping = {
Role.ActorRollout: global_pool_id,
Role.Critic: global_pool_id,
Role.RefPolicy: global_pool_id,
}
Step 1: Construct the mapping between roles and workers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A role represents a group of workers in the same process. We have
pre-defined several roles in `ray_trainer.py <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/ray_trainer.py#L38>`_.
.. code:: python
class Role(Enum):
"""
To create more roles dynamically, you can subclass Role and add new members
"""
Actor = 0 # This worker only has Actor
Rollout = 1 # This worker only has Rollout
ActorRollout = 2 # This worker has both actor and rollout, it's a HybridEngine
Critic = 3 # This worker only has critic
RefPolicy = 4 # This worker only has reference policy
RewardModel = 5 # This worker only has reward model
ActorRolloutRef = 6 # This worker contains actor, rollout and reference policy simultaneously
Step 2: Define the worker class corresponding to this role
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- We have pre-implemented the ``ActorRolloutRefWorker``. Through
different configs, it can be a standalone actor, a standalone rollout,
an ActorRollout HybridEngine, or an ActorRolloutRef HybridEngine
- We also pre-implemented workers for ``Actor``, ``Rollout``,
``Critic``, ``Reward Model`` and ``Reference model`` on two different
backend: PyTorch FSDP
and Megatron-LM.
See `FSDP Workers <https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py>`_
and `Megatron-LM Workers <https://github.com/volcengine/verl/blob/main/verl/workers/megatron_workers.py>`_
for more information.
Step 3: Define resource pool id and resource pool spec
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- A resource pool is a division of the global GPU resources;
  ``resource_pool_spec`` is a dict mapping from each pool id to its number of GPUs.
- In the above example, we defined a global resource pool:
global_pool_id, and then put all roles on this one resource pool
with all the GPUs in this post-training task. This refers to
*co-locate* placement where all the models share the same set of
GPUs.
- See resource pool and placement for advanced usage.
Defining reward model/function
------------------------------
.. code:: python
# we should adopt a multi-source reward function here
# - for rule-based rm, we directly call a reward score
# - for model-based rm, we call a model
# - for code related prompt, we send to a sandbox if there are test cases
# - finally, we combine all the rewards together
# - The reward type depends on the tag of the data
if config.reward_model.enable:
from verl.workers.fsdp_workers import RewardModelWorker
role_worker_mapping[Role.RewardModel] = RewardModelWorker
mapping[Role.RewardModel] = global_pool_id
reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0)
# Note that we always use function-based RM for validation
val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1)
resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
Since not all tasks use model-based RM, users need to define here
whether it's a model-based RM or a function-based RM.
- If it's a model-based RM, directly add the ``RewardModel`` role in the
resource mapping and add it to the resource pool mapping.
- Note that the pre-defined ``RewardModelWorker`` only supports models
with the structure of huggingface
``AutoModelForSequenceClassification``. If it's not this model, you
need to define your own RewardModelWorker in `FSDP Workers <https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py>`_
and `Megatron-LM Workers <https://github.com/volcengine/verl/blob/main/verl/workers/megatron_workers.py>`_.
- If it's a function-based RM, users need to specify the reward
  function for each dataset.
.. code:: python
def _select_rm_score_fn(data_source):
if data_source == 'openai/gsm8k':
return gsm8k.compute_score
elif data_source == 'lighteval/MATH':
return math.compute_score
else:
raise NotImplementedError
See reward functions implemented in `directory <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/>`_
for more information.
Define, init and run the PPO Trainer
------------------------------------
.. code:: python
trainer = RayPPOTrainer(config=config,
tokenizer=tokenizer,
role_worker_mapping=role_worker_mapping,
resource_pool_manager=resource_pool_manager,
ray_worker_group_cls=ray_worker_group_cls,
reward_fn=reward_fn,
val_reward_fn=val_reward_fn)
trainer.init_workers()
trainer.fit()
- We first initialize the ``RayPPOTrainer`` with user config, tokenizer
and all the above worker mapping, resource pool, worker group and
reward functions
- We then call ``trainer.init_workers()`` to initialize the models
on the allocated GPUs (in the resource pool)
- The actual PPO training will be executed in ``trainer.fit()``
verl can be easily extended to other RL algorithms by reusing the Ray
model workers, resource pool and reward functions. See :doc:`extension<../advance/dpo_extension>` for
more information.
Details of the ``RayPPOTrainer`` are discussed in :doc:`Ray Trainer<../workers/ray_trainer>`.
Sandbox Fusion Example
============================
Introduction
------------
Sandbox Fusion is a remote code sandbox service that provides a secure environment for running and evaluating code generated by Large Language Models (LLMs). This example demonstrates how to train an LLM and use Sandbox Fusion to verify generated code, enhancing both security and performance.
By leveraging a remote code sandbox service with greater CPU resources for concurrent code verification, you can reduce the reward stage time by 10-30%, depending on the quality of the generated code.
Step 1: Prepare the Dataset
---------------------------
We use the Eurus-2-RL-Data dataset for training. This dataset combines math and code questions, making it suitable for LLM training tasks. You can download it from HuggingFace: `Eurus-2-RL-Data Dataset <https://huggingface.co/datasets/PRIME-RL/Eurus-2-RL-Data>`_.
Step 2: Set Up the Sandbox Fusion Service
-----------------------------------------
Sandbox Fusion is a remote code sandbox service designed to securely run and evaluate LLM-generated code. To use it:
1. **Access Full Documentation**: For detailed setup instructions, refer to the `Sandbox Fusion Documentation <https://bytedance.github.io/SandboxFusion/>`_.
2. **Deploy the Service**: Choose one of the following deployment methods:
- **Local Deployment**: Follow the guide `here <https://bytedance.github.io/SandboxFusion/docs/docs/get-started#local-deployment>`_.
- **FaaS Instance (Volcengine)**: Create an instance using the `Volcengine Documentation <https://www.volcengine.com/docs/6662/1539235>`_.
After deployment, you will receive an API endpoint in the format: ``https://<ip-address-or-domain-name>/run_code``.
Step 3: Configure the Training Script
-------------------------------------
To integrate Sandbox Fusion into your training script, configure the following parameters:
**Key Settings for Sandbox Fusion**
- ``reward_model.sandbox_fusion.url='<API-endpoint>'``: Enable Sandbox Fusion by specifying the API endpoint (must end with ``/run_code``).
- ``reward_model.sandbox_fusion.max_concurrent=256``: Set the maximum number of concurrent API requests to the Sandbox Fusion service.
**Additional Optimization**
To further reduce code verification time, enable parallel processing with:
- ``reward_model.reward_manager=prime``: The Prime reward manager verifies code across multiple subprocesses concurrently.
**Example Script**
For a practical implementation, refer to the example script:
``examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh``
Once you’ve set your API endpoint in the script, you can start the training job.
.. _algo-baseline-page:
Algorithm Baselines
===================
GSM8k
------------------
Assuming the GSM8k dataset has been preprocessed via ``python3 examples/data_preprocess/gsm8k.py``.
Refer to the table below to reproduce PPO training from different pre-trained models.
.. _Huggingface: https://huggingface.co/google/gemma-2-2b-it#benchmark-results
.. _SFT Command and Logs: https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-sft-0.411.log
.. _SFT+PPO Command and Logs: https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-ppo-bsz512_4-prompt1024-resp-512-0.640.log
.. _wandb: https://api.wandb.ai/links/verl-team/h7ux8602
.. _Qwen Blog: https://qwenlm.github.io/blog/qwen2.5-llm/
.. _PPO Command and Logs: https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log
.. _Megatron PPO Command and Logs: https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/deepseek-llm-7b-chat-megatron-bsz256_4-prompt512-resp512-0.695.log
.. _Qwen7b GRPO Script: https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
.. _Megatron wandb: https://wandb.ai/verl-team/verl_megatron_gsm8k_examples/runs/10fetyr3
.. _Qwen7b ReMax Script: https://github.com/eric-haibin-lin/verl/blob/main/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh
.. _Qwen7b ReMax Wandb: https://wandb.ai/liziniu1997/verl_remax_example_gsm8k/runs/vxl10pln
.. list-table::
   :widths: auto
   :header-rows: 1

   * - Model
     - Method
     - Test score
     - Details
   * - google/gemma-2-2b-it
     - pretrained checkpoint
     - 23.9
     - `Huggingface`_
   * - google/gemma-2-2b-it
     - SFT
     - 52.06
     - `SFT Command and Logs`_
   * - google/gemma-2-2b-it
     - SFT + PPO
     - 64.02
     - `SFT+PPO Command and Logs`_, `wandb`_
   * - Qwen/Qwen2.5-0.5B-Instruct
     - pretrained checkpoint
     - 36.4
     - `Qwen Blog`_
   * - Qwen/Qwen2.5-0.5B-Instruct
     - PPO
     - 56.7
     - `PPO Command and Logs`_
   * - deepseek-ai/deepseek-llm-7b-chat
     - PPO
     - 69.5 [1]_
     - `Megatron PPO Command and Logs`_, `Megatron wandb`_
   * - Qwen/Qwen2-7B-Instruct
     - GRPO
     - 89
     - `Qwen7b GRPO Script`_
   * - Qwen/Qwen2.5-7B-Instruct
     - ReMax
     - 97
     - `Qwen7b ReMax Script`_, `Qwen7b ReMax Wandb`_
.. [1] During the evaluation, we only extracted answers following the "####" format. A more flexible answer extraction, longer response length and better prompt engineering may lead to a higher score.
Frequently Asked Questions
====================================
Ray related
------------
How to add breakpoint for debugging with distributed Ray?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Please checkout the official debugging guide from Ray: https://docs.ray.io/en/latest/ray-observability/ray-distributed-debugger.html
"Unable to register worker with raylet"
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This issue is caused by system settings, e.g., SLURM adding constraints on how CPUs are shared on a node.
While `ray.init()` tries to launch as many worker processes as there are CPU cores on the machine,
these SLURM constraints prevent the `core-workers` from seeing the `raylet` process, leading to the problem.
To fix this issue, you can set the config term ``ray_init.num_cpus`` to a number allowed by your system.
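That config term effectively caps the CPUs Ray claims at initialization, roughly equivalent to the sketch below if you call ``ray.init()`` yourself (16 is just an example value):

.. code-block:: python

    import ray

    # Cap the number of CPUs Ray claims so it fits the allowance granted
    # by the cluster scheduler (e.g., SLURM). 16 is an example value.
    ray.init(num_cpus=16)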
Distributed training
------------------------
How to run multi-node post-training with Ray?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can start a ray cluster and submit a ray job, following the official guide from Ray: https://docs.ray.io/en/latest/ray-core/starting-ray.html
Then, in the configuration, set the ``trainer.nnodes`` config to the number of machines for your job.
How to use verl on a Slurm-managed cluster?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Ray provides users with `this <https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html>`_ official
tutorial to start a Ray cluster on top of Slurm. We have verified the :doc:`GSM8K example<../examples/gsm8k_example>`
on a Slurm cluster under a multi-node setting with the following steps.
1. [Optional] If your cluster supports `Apptainer or Singularity <https://apptainer.org/docs/user/main/>`_ and you wish
to use it, convert verl's Docker image to an Apptainer image. Alternatively, set up the environment with the package
manager available on your cluster or use other container runtimes (e.g. through `Slurm's OCI support <https://slurm.schedmd.com/containers.html>`_) available to you.
.. code:: bash
apptainer pull /your/dest/dir/vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3.sif docker://verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
2. Follow :doc:`GSM8K example<../examples/gsm8k_example>` to prepare the dataset and model checkpoints.
3. Modify `examples/slurm/ray_on_slurm.slurm <https://github.com/volcengine/verl/blob/main/examples/slurm/ray_on_slurm.slurm>`_ with your cluster's own information.
4. Submit the job script to the Slurm cluster with `sbatch`.
Please note that Slurm cluster setup may vary. If you encounter any issues, please refer to Ray's
`Slurm user guide <https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html>`_ for common caveats.
If you changed Slurm resource specifications, please make sure to update the environment variables in the job script if necessary.
Install related
------------------------
NotImplementedError: TensorDict does not support membership checks with the `in` keyword.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Detail error information:
.. code:: bash
NotImplementedError: TensorDict does not support membership checks with the `in` keyword. If you want to check if a particular key is in your TensorDict, please use `key in tensordict.keys()` instead.
Cause of the problem: there is no suitable version of the tensordict package for the linux-arm64 platform. You can confirm this as follows:
.. code:: bash
pip install tensordict==0.6.2
Output example:
.. code:: bash
ERROR: Could not find a version that satisfies the requirement tensordict==0.6.2 (from versions: 0.0.1a0, 0.0.1b0, 0.0.1rc0, 0.0.2a0, 0.0.2b0, 0.0.3, 0.1.0, 0.1.1, 0.1.2, 0.8.0, 0.8.1, 0.8.2, 0.8.3)
ERROR: No matching distribution found for tensordict==0.6.2
Solution 1:
Install tensordict from source:
.. code:: bash
pip uninstall tensordict
git clone https://github.com/pytorch/tensordict.git
cd tensordict/
git checkout v0.6.2
python setup.py develop
pip install -v -e .
Solution 2:
Temporarily modify the code where the error occurs, changing ``tensordict_var`` to ``tensordict_var.keys()``, as in the sketch below.
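A minimal before/after sketch of that workaround:

.. code-block:: python

    import torch
    from tensordict import TensorDict

    td = TensorDict({"obs": torch.zeros(2, 3)}, batch_size=[2])

    # Fails on the affected tensordict versions:
    # if "obs" in td: ...

    # Workaround suggested by the error message:
    if "obs" in td.keys():
        print("key found")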
Illegal memory access
---------------------------------
If you encounter an error message like ``CUDA error: an illegal memory access was encountered`` during rollout, it is most likely due to a known issue in vLLM (<= 0.6.3).
Please set the following environment variable. The env var must be set before the ``ray start`` command, if any is used.
.. code:: bash
export VLLM_ATTENTION_BACKEND=XFORMERS
If in doubt, print this env var in each rank to make sure it is properly set.
Checkpoints
------------------------
If you want to convert the model checkpoint into huggingface safetensor format, please refer to ``scripts/model_merger.py``.
Triton ``compile_module_from_src`` error
------------------------------------------------
If you encounter triton compilation error similar to the stacktrace below, please set the ``use_torch_compile`` flag according to
https://verl.readthedocs.io/en/latest/examples/config.html to disable just-in-time compilation for fused kernels.
.. code:: bash
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 338, in run
return self.fn.run(*args, **kwargs)
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/jit.py", line 607, in run
device = driver.active.get_current_device()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 23, in __getattr__
self._initialize_obj()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 20, in _initialize_obj
self._obj = self._init_fn()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 9, in _create_driver
return actives[0]()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 371, in __init__
self.utils = CudaUtils() # TODO: make static
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 80, in __init__
mod = compile_module_from_src(Path(os.path.join(dirname, "driver.c")).read_text(), "cuda_utils")
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 57, in compile_module_from_src
so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries)
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/build.py", line 48, in _build
ret = subprocess.check_call(cc_cmd)
File "/data/lbh/conda_envs/verl/lib/python3.10/subprocess.py", line 369, in check_call
raise CalledProcessError(retcode, cmd)
What is the meaning of train batch size, mini batch size, and micro batch size?
------------------------------------------------------------------------------------------
This figure illustrates the relationship between different batch size configurations.
https://excalidraw.com/#json=pfhkRmiLm1jnnRli9VFhb,Ut4E8peALlgAUpr7E5pPCA
.. image:: https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d
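As a rough illustration of how these knobs relate, here is a sketch with made-up numbers; the names follow the config keys used in the example scripts, and the exact normalization is done inside the workers.

.. code-block:: python

    # Made-up numbers illustrating the usual relationship between the knobs.
    train_batch_size = 1024            # prompts sampled per training step (global)
    ppo_mini_batch_size = 256          # prompts per PPO optimizer update (global)
    ppo_micro_batch_size_per_gpu = 16  # per-GPU chunk for one forward/backward pass
    n_gpus = 8

    updates_per_step = train_batch_size // ppo_mini_batch_size              # 4
    mini_batch_per_gpu = ppo_mini_batch_size // n_gpus                       # 32
    grad_accum_steps = mini_batch_per_gpu // ppo_micro_batch_size_per_gpu    # 2
    print(updates_per_step, mini_batch_per_gpu, grad_accum_steps)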
How to generate ray timeline to analyse performance of a training job?
------------------------------------------------------------------------------------------
To generate the ray timeline file, you can set the config term ``ray_init.timeline_file`` to a json file path.
For example:
.. code:: bash
ray_init.timeline_file=/tmp/ray_timeline.json
The file will be generated in the specified path at the end of a training job.
You can use tools like chrome://tracing or the Perfetto UI and view the ray timeline file.
This figure shows the ray timeline file generated from a training job on 1 node with 4 GPUs.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray_timeline.png?raw=true
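Under the hood this corresponds to Ray's timeline export; you can also dump the trace manually from the driver with Ray's public ``ray.timeline`` API, roughly as sketched below.

.. code-block:: python

    import ray

    ray.init()
    # ... run your workload ...
    ray.timeline(filename="/tmp/ray_timeline.json")  # view in chrome://tracing or Perfetto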
How to set proxy only for wandb?
------------------------------------------------------------------------------------------
If you need a proxy to access wandb, you can add the config below to your training job script.
Compared to using a global https_proxy environment variable, this approach won't affect other HTTP requests, such as those from the ChatCompletionScheduler.
.. code:: bash
+trainer.wandb_proxy=http://<your proxy and port>
Welcome to verl's documentation!
================================================
verl is a flexible, efficient and production-ready RL training framework designed for large language models (LLMs) post-training. It is an open source implementation of the `HybridFlow <https://arxiv.org/pdf/2409.19256>`_ paper.
verl is flexible and easy to use with:
- **Easy extension of diverse RL algorithms**: The hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex post-training dataflows, allowing users to build RL dataflows in a few lines of code.
- **Seamless integration of existing LLM infra with modular APIs**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM, vLLM and SGLang. Moreover, users can easily extend to other LLM training and inference frameworks.
- **Flexible device mapping and parallelism**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
- Ready integration with popular HuggingFace models
verl is fast with:
- **State-of-the-art throughput**: By seamlessly integrating existing SOTA LLM training and inference frameworks, verl achieves high generation and training throughput.
- **Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
--------------------------------------------
.. _Contents:
.. toctree::
:maxdepth: 2
:caption: Quickstart
start/install
start/quickstart
start/multinode
start/ray_debug_tutorial
.. toctree::
:maxdepth: 2
:caption: Programming guide
hybrid_flow
single_controller
.. toctree::
:maxdepth: 1
:caption: Data Preparation
preparation/prepare_data
preparation/reward_function
.. toctree::
:maxdepth: 2
:caption: Configurations
examples/config
.. toctree::
:maxdepth: 1
:caption: PPO Example
examples/ppo_code_architecture
examples/gsm8k_example
examples/multi_modal_example
.. toctree::
:maxdepth: 1
:caption: Algorithms
algo/ppo.md
algo/grpo.md
algo/dapo.md
algo/spin.md
algo/sppo.md
algo/opo.md
algo/baseline.md
.. toctree::
:maxdepth: 1
:caption: PPO Trainer and Workers
workers/ray_trainer
workers/fsdp_workers
workers/megatron_workers
workers/sglang_worker
.. toctree::
:maxdepth: 1
:caption: Performance Tuning Guide
perf/dpsk.md
perf/perf_tuning
README_vllm0.8.md
perf/device_tuning
.. toctree::
:maxdepth: 1
:caption: Adding new models
advance/fsdp_extension
advance/megatron_extension
.. toctree::
:maxdepth: 1
:caption: Advanced Features
advance/checkpoint
advance/rope
advance/ppo_lora.rst
sglang_multiturn/multiturn.rst
advance/placement
advance/dpo_extension
examples/sandbox_fusion_example
.. toctree::
:maxdepth: 1
:caption: API References
api/data
api/single_controller.rst
api/trainer.rst
api/utils.rst
.. toctree::
:maxdepth: 2
:caption: FAQ
faq/faq
Contribution
-------------
verl is free software; you can redistribute it and/or modify it under the terms
of the Apache License 2.0. We welcome contributions.
Join us on `GitHub <https://github.com/volcengine/verl>`_, `Slack <https://join.slack.com/t/verlgroup/shared_invite/zt-2w5p9o4c3-yy0x2Q56s_VlGLsJ93A6vA>`_ and `Wechat <https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/WeChat.JPG>`_ for discussions.
Contributions from the community are welcome! Please check out our `project roadmap <https://github.com/volcengine/verl/issues/710>`_ and `good first issues <https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22>`_ to see where you can contribute.
Code Linting and Formatting
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
We use pre-commit to help improve code quality. To initialize pre-commit, run:
.. code-block:: bash
pip install pre-commit
pre-commit install
To resolve CI errors locally, you can also manually run pre-commit by:
.. code-block:: bash
pre-commit run
Adding CI tests
^^^^^^^^^^^^^^^^^^^^^^^^
If possible, please add CI test(s) for your new feature:
1. Find the most relevant workflow yml file, which usually corresponds to a ``hydra`` default config (e.g. ``ppo_trainer``, ``ppo_megatron_trainer``, ``sft_trainer``, etc).
2. Add related path patterns to the ``paths`` section if not already included.
3. Minimize the workload of the test script(s) (see existing scripts for examples).
We are HIRING! Send us an `email <mailto:haibin.lin@bytedance.com>`_ if you are interested in internship/FTE opportunities in MLSys/LLM reasoning/multimodal alignment.
Resources Needed for verl RL (LoRA)
========================================
Since RL requires more resources than regular training, determining how many
resources are needed to run it successfully is relatively difficult. To provide
reference points for resource selection across different models and tasks, this
section introduces the environment requirements based on experiments we have conducted.
However, due to limited staff and equipment, we also hope for more
contributions from the open-source community. When submitting a PR, please
provide a script to be added to the ``examples/tuning`` scripts.
We need two types of scripts: one is the configuration that can run with the **minimum
resources (min)**, and the other is the configuration that runs with **recommended resources (recommended)**. The former
can be understood as a script that runs after applying all memory optimization techniques
(e.g., offload, gradient checkpointing); the latter as a script that
avoids operations incurring additional time overhead as much as possible (targeting best throughput).
When defining script names, please follow this format:
``[model]_[task]_[gpunums]_[device]_[train]_[infer].sh``. This will effectively improve
the script's recognizability. You can place the script under the ``examples/tuning/`` directory.
If you happen to have a configuration that has already been tested, we welcome you to submit
a PR and include a screenshot from Wandb or other verifiable evidence.
----------------------------------------
0.5B
~~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2.5-0.5B
- GRPO-LoRA
- 1*H100
- 116
- fsdp
- vllm0.8.3
- `qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/0.5b/qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
1.5B
~~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2.5-1.5B
- GRPO-LoRA
- 1*H100
- 128
- fsdp
- vllm0.8.3
- `qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/1.5b/qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
3B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2.5-3B
- GRPO-LoRA
- 1*H100
- 62
- fsdp
- vllm0.8.3
- `qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/3b/qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
7B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-7B
- GRPO
- 2*H800
- \
- fsdp
- vllm0.8.2
- `qwen2-7b_grpo_2_h800_fsdp_vllm <https://github.com/volcengine/verl/blob/main/examples/tuning/7b/qwen2-7b_grpo_2_h800_fsdp_vllm.sh>`_
- `Xiangyongan <xiangyongan@bytedance.com>`_
* - MIN
- Qwen2.5-7B
- GRPO-LoRA
- 1*H100
- 16
- fsdp
- vllm0.8.3
- `qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/7b/qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
14B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-14B
- GRPO
- 4*H800
- \
- fsdp
- vllm0.8.2
- `qwen2-14b_grpo_4_h800_fsdp_vllm <https://github.com/volcengine/verl/blob/main/examples/tuning/14b/qwen2-14b_grpo_4_h800_fsdp_vllm.sh>`_
- `Xiangyongan <xiangyongan@bytedance.com>`_
* - MIN
- Qwen2.5-14B
- GRPO-LoRA
- 2*H100
- 116
- fsdp
- vllm0.8.3
- `qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/14b/qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
32B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-32B
- GRPO
- 8*H20
- \
- megatron
- vllm0.8.2
- `qwen2-32b_grpo_8_h20_megatron_vllm <https://github.com/volcengine/verl/tree/main/examples/tuning/32b/qwen2_32B_grpo_8_h20_megatron_vllm.sh>`_
- `Xiangyongan <xiangyongan@bytedance.com>`_
* - MIN
- Qwen2.5-32B
- GRPO-LoRA
- 4*H100
- 180
- fsdp
- vllm0.8.3
- `qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/32b/qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
70B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-70B
- GRPO
- 32*H20
- \
- fsdp
- vllm0.8.2
- `qwen2-70b_grpo_32_h20_fsdp_vllm <https://github.com/volcengine/verl/blob/main/examples/tuning/70b/qwen2-70b_grpo_32_h20_fsdp_vllm.sh>`_
- `Xiangyongan <xiangyongan@bytedance.com>`_
* - MIN
- Qwen2-70B
- GRPO
- 32*H800
- \
- fsdp
- vllm0.8.3
- `qwen2-70b_grpo_32_h800_fsdp_vllm <https://github.com/volcengine/verl/blob/main/examples/tuning/70b/qwen2-70b_grpo_32_h800_fsdp_vllm.sh>`_
- `Xiangyongan <xiangyongan@bytedance.com>`_
* - MIN
- Qwen2.5-72B
- GRPO-LoRA
- 8*H100
- 176
- fsdp
- vllm0.8.3
- `qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/70b/qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
405B
~~~~
.. table::
:widths: auto
====== ====== ====== ======== ======== ====== ====== ======
tag model task resource MaxBatch train infer link
====== ====== ====== ======== ======== ====== ====== ======
\ \ \ \ \ \ \
====== ====== ====== ======== ======== ====== ====== ======
671B
~~~~
.. table::
:widths: auto
====== ====== ====== ======== ======== ====== ====== ======
tag model task resource MaxBatch train infer link
====== ====== ====== ======== ======== ====== ====== ======
\ \ \ \ \ \ \
====== ====== ====== ======== ======== ====== ====== ======
# Training DeepSeek 671b
verl integrates Megatron to support large MoE models such as `Qwen3-235B-A22B` and `deepseek-ai/DeepSeek-V3`. This is an ongoing community effort.
Along the way, the community added the following features and optimizations that enable verl to support larger models:
- per tensor weight resharding between rollout and training
- context parallelism and expert parallelism enabled via megatron
- dynamic batch size (sequence balance) for megatron
- reduced ray-related serialization overhead
- optimizer offloading, recomputation, and efficient kernels
- various debugging metrics and utils
The Megatron backend now supports a wider list of models:
- DeepSeek-V3
- Moonlight
- Qwen3
- Qwen2.5-VL (to be merged soon)
- Qwen2
- Mixtral
## Getting Started
### DeepSeek 671b
The recommended image with pre-built megatron dependency is `whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.1-te2.3-deepseekv3`, built with the Dockerfile in [docker/Dockerfile.vllm.sglang.megatron.deepseek](https://github.com/volcengine/verl/blob/main/docker/Dockerfile.vllm.sglang.megatron.deepseek).
For checkpoint loading, we rely on megatron dist-ckpt for resharding. A converted dist-ckpt for DeepSeek-V3 is available from [huggingface BearBiscuit05/dpsk-v3-671B-BF16-dist_ckpt](https://huggingface.co/BearBiscuit05/dpsk-v3-671B-BF16-dist_ckpt/tree/main).
To run end-to-end training on the DAPO dataset, run [recipe/dapo/test_dapo_dspk_671b_megatron.sh](https://github.com/volcengine/verl/blob/main/recipe/dapo/test_dapo_dspk_671b_megatron.sh). It runs on 512 H20(96GB) GPUs with the following setup:
- vllm rollout with TP=32, bfloat16
- megatron training with attention DP, MoE EP=32, PP=16, bfloat16
MTP is disabled during RL training.
### Qwen3 236b
For Qwen3-236b, please refer to [examples/grpo_trainer/run_qwen3-236b_megatron.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3-236b_megatron.sh), which runs on 128 H20(96GB) GPUs.
## Upcoming Optimizations
The community continues to optimize large MoE models further; ongoing efforts include:
- further optimizing memory consumption and providing recommended/tuned configurations for various machine types
- optimizing long context RL training performance
- performance improvement with SGLang x Megatron
We invite the community to try and improve verl together. Get connected with us on [slack](https://join.slack.com/t/verlgroup/shared_invite/zt-2w5p9o4c3-yy0x2Q56s_VlGLsJ93A6vA)/[wechat](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/WeChat.JPG)/[Github issues](https://github.com/volcengine/verl/issues/708)!
## Acknowledgement
@vermouth1992 @ISEEKYAN @ETOgaosion @yzlnew @ShareLer @BearBiscuit05 @ccclyu @ann-qin-lu @SwordFaith @zzong2006 @zhaochenyang20 @ocss884 @eric-haibin-lin
Performance Tuning Guide
==============================
Author: `Guangming Sheng <https://github.com/PeterSH6>`_
In this section, we will discuss how to tune the performance of all the stages in verl, including:
1. Rollout generation throughput.
2. Enable ``use_remove_padding=True`` for sequence packing (i.e., data packing and remove padding).
3. Batch size tuning for forward and backward computation
4. Enable ``use_dynamic_bsz=True`` for higher throughput.
5. Utilize Ulysses Sequence Parallel for Long Context Training
6. LigerKernel for SFT performance optimization
Rollout Generation Tuning
--------------------------
verl currently supports two rollout backends: vLLM and TGI (with SGLang support coming soon).
Below are key factors for tuning vLLM-based rollout. Before tuning, we recommend setting ``actor_rollout_ref.rollout.disable_log_stats=False`` so that rollout statistics are logged.
- Increase ``gpu_memory_utilization``.
- For vLLM v0.5.4 and v0.6.3, the vLLM pre-allocates GPU KVCache by using gpu_memory_utilization of the **remaining** memory.
- For vLLM v0.7.0 and later, the vLLM instance will only use gpu_memory_utilization of the **total** memory.
- For SGLang, it's the fraction of the free GPU memory used for **static** memory like model weights and KV cache. However, the remaining (1-gpu_memory_utilization) will also be used during inference.
However, if model parameters and optimizer states are not offloaded, using too high a fraction can lead to OOM.
A value between 0.5 and 0.7 often strikes a good balance between high throughput and avoiding OOM.
Note: since the definition of ``gpu_memory_utilization`` varies across inference engines, a value that works well for one engine may cause OOM for another.
- Adjust ``max_num_seqs`` or ``max_num_batched_tokens``.
If the GPU cache utilization is relatively low in the log, increasing ``max_num_seqs`` or ``max_num_batched_tokens``
can enlarge the effective batch size in the decoding stage, allowing more concurrent requests per batch.
We recommend setting ``max_num_batched_tokens > 2048`` for higher throughput.
- Use a smaller ``tensor_parallel_size``.
When GPU resources allow, a smaller tensor parallel size spawns more vLLM replicas.
Data parallelism (DP) can yield higher throughput than tensor parallelism (TP), but also increases KVCache consumption.
Carefully balance the trade-off between more replicas and higher memory usage.
Our experiments in Sec. 8.4 of the `HybridFlow paper <https://arxiv.org/pdf/2409.19256v2>`_ evaluate this trade-off.
More tuning details, such as dealing with preemption and chunked prefill,
can be found in the `vLLM official tuning guide <https://docs.vllm.ai/en/latest/performance/optimization.html>`_.
The performance of vLLM can be further improved by upgrading from v0.6.3 to v0.7. See https://github.com/volcengine/verl/blob/main/docs/README_vllm0.7.md for details on how to upgrade.
Enable remove padding (sequence packing)
-----------------------------------------
Currently, for llama, mistral, gemma1 and qwen based models, users can enable `use_remove_padding=True` to utilize the
sequence packing implementation provided by the transformers library.
For other models, the transformers library may also support it, but we haven't tested it yet.
Users can add the desired model config to the `test_transformer.py <https://github.com/volcengine/verl/blob/main/tests/models/test_transformer.py#L24>`_ file
and test its functionality by running the following command:
.. code-block:: bash
pytest -s tests/models/test_transformer.py
If the test passes, you can add your desired model into the model `registry.py <https://github.com/volcengine/verl/blob/main/verl/models/registry.py#L24>`_ file.
Then you can enjoy the performance boost of sequence packing,
and you are welcome to PR your tested model to verl!
Batch Size Tuning
-----------------
To achieve higher throughput in experience preparation (i.e., model fwd) and model update (i.e., actor/critic fwd/bwd),
users may need to tune ``*micro_batch_size_per_gpu`` for the different computations.
In verl, the core principle for setting batch sizes is:
- **Algorithmic metrics** (train batch size, PPO mini-batch size) are *global* (from a single-controller perspective),
normalized in each worker. See the `normalization code <https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py#L120-L122>`_.
- **Performance-related parameters** (micro batch size, max token length for dynamic batch size) are *local* parameters that define the per-GPU data allocations.
See the `normalization code <https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py#L127>`_.
.. note:: In your training script, please use ``*micro_batch_size_per_gpu`` instead of ``*micro_batch_size``.
This way you don't need to handle the normalization of ``micro_batch_size``, which will be deprecated. A small numeric sketch of this normalization follows.
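As a concrete illustration of the split between global and local parameters, the sketch below walks through the arithmetic for a hypothetical setup. The numbers and the division by the data-parallel world size are assumptions for illustration; see the normalization code linked above for the exact logic.

.. code-block:: python

   # Hypothetical numbers for illustration only.
   n_gpus = 8                          # data-parallel world size
   ppo_mini_batch_size = 256           # *global* algorithmic batch size
   ppo_micro_batch_size_per_gpu = 4    # *local* performance knob

   # Each worker sees its share of the global mini-batch ...
   mini_batch_per_gpu = ppo_mini_batch_size // n_gpus                      # 32 samples
   # ... and splits it into micro-batches for gradient accumulation.
   grad_accum_steps = mini_batch_per_gpu // ppo_micro_batch_size_per_gpu   # 8 steps

   print(mini_batch_per_gpu, grad_accum_steps)

Tuning ``*micro_batch_size_per_gpu`` only changes how the per-GPU share is chunked; the algorithmic batch sizes stay untouched.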
Batch Size Tuning tips
""""""""""""""""""""""
Therefore, users may need to tune ``*micro_batch_size_per_gpu`` to accelerate training. Here are some tips:
1. **Enable gradient checkpointing**:
Set ``actor_rollout_ref.model.enable_gradient_checkpointing=True`` and ``critic.model.enable_gradient_checkpointing=True``.
This often allows for larger micro-batch sizes and will be beneficial for large mini-batch training.
2. **Increase the micro batch size as much as possible**:
   Increase ``*micro_batch_size_per_gpu`` until it equals the normalized ``mini_batch_size``.
3. **Use larger forward-only parameters**:
   Forward-only parameters, such as ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``,
   ``actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu``, and ``critic.forward_micro_batch_size_per_gpu``, can be larger (e.g., 2x) than training-related micro batch sizes
   such as ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu`` and ``critic.ppo_micro_batch_size_per_gpu``.
4. **Allow larger micro-batch sizes for Critic and Reward models**:
   The micro batch size of the Critic and Reward models can be larger than the Actor's. This is because the Actor's final layer projects onto the full vocabulary, which makes its forward/backward pass much more memory-hungry.
5. **Enable activation offloading**:
Set ``actor_rollout_ref.model.enable_activation_offload=True`` and ``critic.model.enable_activation_offload=True``.
This often works together with gradient checkpointing to allow larger micro-batch sizes; it is currently only available in the FSDP backend.
Tuning for Dynamic Batch Size
-----------------------------
Dynamic batch size is a technique that allows the model to process a similar number of tokens in each forward pass (with varying actual batch sizes).
This can significantly improve the training efficiency and reduce the memory usage.
To utilize this technique, users can set ``use_dynamic_bsz=True`` in actor, ref, critic and reward models.
With ``use_dynamic_bsz=True``, users don't need to tune ``*micro_batch_size_per_gpu``.
Instead, users should tune the following parameters:
- ``actor_rollout_ref.actor.ppo_max_token_len_per_gpu``, ``critic.ppo_max_token_len_per_gpu``:
The maximum number of tokens to be processed in fwd and bwd of ``update_policy`` and ``update_critic``.
- ``actor_rollout_ref.ref.log_prob_max_token_len_per_gpu`` and ``actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu``:
The maximum number of tokens to be processed in the fwd computation of ``compute_log_prob`` and ``compute_ref_log_prob``.
- ``critic.forward_micro_batch_size_per_gpu``, ``reward_model.forward_micro_batch_size_per_gpu``:
The maximum number of tokens to be processed in the fwd computation of ``compute_values`` and ``compute_rm_score``.
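To make the token-budget idea concrete, the following is a minimal greedy-packing sketch (not verl's actual implementation) showing how sequences of different lengths are grouped so that each forward pass stays under the per-GPU token limit:

.. code-block:: python

   def pack_by_token_budget(seq_lens, max_token_len_per_gpu):
       """Greedily group sequence indices so each group's total length stays under the budget."""
       groups, current, current_tokens = [], [], 0
       for i, n in enumerate(seq_lens):
           if current and current_tokens + n > max_token_len_per_gpu:
               groups.append(current)
               current, current_tokens = [], 0
           current.append(i)
           current_tokens += n
       if current:
           groups.append(current)
       return groups

   # Sequences of varying length, with an 8k-token budget per forward pass:
   print(pack_by_token_budget([3000, 1000, 5000, 2000, 7000], 8000))
   # [[0, 1], [2, 3], [4]]

Each group becomes one forward pass, so every pass processes a similar number of tokens even though the number of sequences per pass varies.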
Dynamic Batch Size Tuning tips
""""""""""""""""""""""""""""""
Here are some tips to tune the above parameters; a short numeric sketch follows this list:
1. **Increase** ``actor_rollout_ref.actor.ppo_max_token_len_per_gpu``
Make it at least 2 x (max_prompt_length + max_response_length). We set it to 3x in `run_qwen2-7b_rm_seq_balance.sh <https://github.com/volcengine/verl/blob/main/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh#L25>`_.
Try to increase it to get higher throughput.
2. **Forward-only parameters can be larger**:
Similar to the non-dynamic-batch scenario, forward-only token limits can exceed those used in forward/backward operations.
3. **Use larger limits for Critic and Reward models**:
Critic and Reward parameters can be set at least 2× the Actor’s limits. For instance, we set them to 4× here:
`run_qwen2-7b_rm_seq_balance.sh <https://github.com/volcengine/verl/blob/main/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh#L40>`_
.. :math:`\text{critic.ppo_max_token_len_per_gpu} = 2 \times \text{actor.ppo_max_token_len_per_gpu}`
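Putting these tips together, here is a quick numeric sketch with assumed prompt/response lengths; the multipliers follow the rules of thumb above and should be tuned for your own workload:

.. code-block:: python

   # Assumed lengths, for illustration only.
   max_prompt_length = 1024
   max_response_length = 1024
   seq_len = max_prompt_length + max_response_length        # 2048 tokens per sample

   actor_budget = 3 * seq_len              # at least 2x; 3x following the example script -> 6144
   forward_only_budget = 2 * actor_budget  # forward-only limits can exceed fwd/bwd ones -> 12288
   critic_budget = 2 * actor_budget        # Critic/Reward at least 2x the Actor's limit -> 12288

   print(actor_budget, forward_only_budget, critic_budget)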
Ulysses Sequence Parallel for Long Context Training
----------------------------------------------------
To utilize this technique, users can set ``ulysses_sequence_parallel_size>1`` in actor, ref, critic and reward models.
We support different models using different ``ulysses_sequence_parallel_size`` values.
To train on long sequences (>32k), users may need to decrease ``*micro_batch_size_per_gpu`` and ``*max_token_len_per_gpu`` to avoid OOM.
LigerKernel for SFT
----------------------
LigerKernel is a high-performance kernel for Supervised Fine-Tuning (SFT) that can improve training efficiency. To enable LigerKernel in your SFT training:
1. Install liger-kernel via ``pip3 install liger-kernel``.
2. In your SFT configuration file (e.g., ``verl/trainer/config/sft_trainer.yaml``), set the ``use_liger`` parameter:
.. code-block:: yaml

   model:
     use_liger: True  # Enable LigerKernel for SFT
The default value is ``False``; enable it only when you want to use LigerKernel's optimizations. LigerKernel is particularly useful for improving training performance in SFT scenarios.
Prepare Data for Post-Training
========================================
Before starting the post-training job, we need to prepare the data for
the policy training. The data should be stored in the parquet format.
We provide several data preprocess scripts for different datasets,
including GSM8K, MATH, HellaSwag and Full_hh_rlhf. To prepare other datasets, follow the steps below.
The data preprocess script can be divided into two parts:
1. The first part is the common part, which loads the dataset from
huggingface's ``datasets`` package, preprocesses it with
``make_map_fn``, and stores it in the parquet format.
.. code:: python
import re
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
import argparse
# To extract the solution for each prompts in the dataset
# def extract_solution(solution_str):
# ...
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--local_dir', default='/opt/tiger/gsm8k')
parser.add_argument('--hdfs_dir', default=None)
args = parser.parse_args()
num_few_shot = 5
data_source = 'openai/gsm8k'
dataset = datasets.load_dataset(data_source, 'main')
train_dataset = dataset['train']
test_dataset = dataset['test']
# Construct a `def make_map_fn(split)` for the corresponding datasets.
# ...
train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
local_dir = args.local_dir
hdfs_dir = args.hdfs_dir
train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
if hdfs_dir is not None:
    makedirs(hdfs_dir)
    copy(src=local_dir, dst=hdfs_dir)
2. Users are required to implement the ``make_map_fn()`` function
(as well as ``extract_solution``) on their own to support
different datasets or tasks.
We have already implemented the data preprocessing for the GSM8K, MATH, HellaSwag and Full_hh_rlhf
datasets. Here we take the GSM8K dataset as an example:
**GSM8K**
In the ``make_map_fn``, each data item should consist of the following
5 fields:
1. ``data_source``: The name of the dataset. To index the corresponding
reward function in the ``RewardModule``
2. ``prompt``: This field should be constructed in the format of
huggingface chat_template. The tokenizer in ``RLHFDataset`` will
apply chat template and tokenize the prompt.
3. ``ability``: Define the task category.
4. ``reward_model``: Currently, we only utilize the ``ground_truth``
field during evaluation. The ``ground_truth`` is computed by the
``extract_solution`` function. **Note** that the implementation of
the corresponding reward function should align with this extracted
``ground_truth``.
5. ``extra_info``: Record some information about the current prompt. Not
used for now.
.. code:: python
def extract_solution(solution_str):
solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) # extract the solution after ####
assert solution is not None
final_solution = solution.group(0)
final_solution = final_solution.split('#### ')[1].replace(',', '')
return final_solution
instruction_following = "Let's think step by step and output the final answer after \"####\"."
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question = example.pop('question')
question = question + ' ' + instruction_following
answer = example.pop('answer')
solution = extract_solution(answer)
data = {
"data_source": data_source,
"prompt": [{
"role": "user",
"content": question
}],
"ability": "math",
"reward_model": {
"style": "rule",
"ground_truth": solution
},
"extra_info": {
'split': split,
'index': idx
}
}
return data
return process_fn
Implement Reward Function for Dataset
======================================
For each dataset, we need to implement a reward function or utilize a reward model to compute the rewards for the generated responses.
We already pre-implemented some reward functions in `reward_score directory <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score>`_.
You can also use customized reward functions.
Currently, we support reward functions for the GSM8k and MATH datasets. For RLHF datasets (e.g.,
full_hh_rlhf) and code generation (e.g., APPS), we utilize a reward model
and SandBox (to be open-sourced soon) for evaluation, respectively.
RewardManager
-------------
In the entrypoint of the PPO Post-Training script `main_ppo.py <https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py#L33>`_,
we implement a ``RewardManager`` that utilizes pre-implemented reward functions to compute the scores for each response.
In the ``RewardManager``, we implemented a ``__call__`` function to
compute the score for each response.
All the reward functions are executed by ``compute_score_fn``.
The input is a ``DataProto``, which includes:
- ``input_ids``, ``attention_mask``: ``input_ids`` and ``attention_mask`` after applying
chat_template, including prompt and response
- ``responses``: response tokens
- ``ground_truth``: The ground truth string of the current prompt.
Stored in ``non_tensor_batch`` in the ``DataProto``, which should be
preprocessed in the parquet files.
- ``data_source``: The dataset name of the current prompt. Stored in
``non_tensor_batch`` in the ``DataProto``, which should be
preprocessed in the parquet files.
After detokenizing the responses, the response string and the ground
truth string are passed to ``compute_score_fn`` to compute the
score for each response.
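The control flow described above boils down to something like the following simplified, framework-free sketch. It uses plain Python stand-ins instead of ``DataProto``, and the names are illustrative; verl's actual ``RewardManager`` in ``main_ppo.py`` differs in detail.

.. code:: python

   # A simplified sketch of the RewardManager idea; `batch` stands in for the DataProto
   # described above (a list of per-sample dicts here, purely for illustration).
   def reward_manager_call(batch, tokenizer, compute_score_fn):
       scores = []
       for item in batch:
           # Detokenize the response tokens produced by the rollout.
           solution_str = tokenizer.decode(item["responses"], skip_special_tokens=True)
           # `ground_truth` and `data_source` come from the non-tensor fields,
           # preprocessed into the parquet files.
           score = compute_score_fn(
               data_source=item["data_source"],
               solution_str=solution_str,
               ground_truth=item["ground_truth"],
           )
           scores.append(score)
       return scores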
Reward Functions
----------------
Pre-implemented
~~~~~~~~~~~~~~~
We already pre-implemented some reward functions in `reward_score directory <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score>`_.
- In the `GSM8k example <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/gsm8k.py>`_, we
force the response to output the final answer after four hash signs (``####``), then
use string matching to compare it with the ground truth. If the answer is completely
correct, it scores 1 point; if only the format is correct, it scores 0.1 points; if
the format is incorrect, it scores 0 points. (A simplified sketch of this scoring rule follows this list.)
- In the `MATH example <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/math.py>`_, we follow
the implementation in `lm-evaluation-harness repository <https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py>`_.
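The GSM8k scoring rule described above can be summarized by the following simplified sketch; the actual ``gsm8k.py`` handles more edge cases:

.. code:: python

   import re

   def gsm8k_score_sketch(solution_str, ground_truth, format_score=0.1, score=1.0):
       # Expect the final answer after "####", as enforced by the prompt.
       match = re.search(r"#### (\-?[0-9\.\,]+)", solution_str)
       if match is None:
           return 0.0                                   # wrong format
       answer = match.group(1).replace(',', '')
       return score if answer == ground_truth else format_score  # 1.0 vs 0.1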
Customized
~~~~~~~~~~
You can implement customized reward functions in a separate file and specify them using ``custom_reward_function.path`` and ``custom_reward_function.name``. For how to set these options, please refer to :ref:`config-explain-page`.
The parameters of your reward function should be ``data_source``, ``solution_str``, ``ground_truth``, and ``extra_info``.
For example:
.. code:: python
def my_reward_fn(data_source, solution_str, ground_truth, extra_info=None):
return len(solution_str)/100
If you are testing only a single customized reward function, you can simply name it 'compute_score' and leave ``custom_reward_function.name`` unset.
To run multiple tests with different customized reward functions, you can modify both ``custom_reward_function.path`` and ``custom_reward_function.name`` for each trial.
For instance, you might create a single `my_reward.py` file and implement multiple reward functions within it. This way, for different trials, you only need to adjust ``custom_reward_function.name``, making it more convenient to conduct multiple tests within scripts.
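For instance, a hypothetical ``my_reward.py`` with two alternative reward functions might look like this (the function names are illustrative):

.. code:: python

   # my_reward.py -- hypothetical example with several candidate reward functions.
   def length_reward(data_source, solution_str, ground_truth, extra_info=None):
       # Trivial baseline: longer answers score higher.
       return len(solution_str) / 100

   def exact_match_reward(data_source, solution_str, ground_truth, extra_info=None):
       # 1.0 only if the ground truth string appears verbatim in the response.
       return 1.0 if ground_truth in solution_str else 0.0

For one trial you would point ``custom_reward_function.path`` at this file and set ``custom_reward_function.name=exact_match_reward``; for the next trial, only the ``name`` needs to change.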
# markdown support
recommonmark
myst_parser
# markdown table support
sphinx-markdown-tables
# theme default rtd
# crate-docs-theme
sphinx-rtd-theme
# pin tokenizers version to avoid env_logger version req
tokenizers==0.19.1
Multi-turn Rollout Support
==========================
Basic Configuration
~~~~~~~~~~~~~~~~~~~
To enable multi-turn rollout, make sure to configure the following fields in your rollout configuration:
.. code-block:: yaml
actor_rollout_ref:
rollout:
multi_turn: True
name: "sglang"
This configuration activates the SGLang engine for multi-turn interaction during rollout.
Custom Tool Configuration
~~~~~~~~~~~~~~~~~~~~~~~~~
For custom environment interaction tools, you can implement your own tools based on ``verl.tools.base_tool.BaseTool``. Then, specify your tool configurations in a YAML file:
.. code-block:: yaml
tools:
- class_name: ""
config: {}
tool_schema:
You may refer to GSM8KTool_example_configuration_, which is one example of the tool configurations. Its implementation can be found in gsm8k_tool.py_.
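As a rough sketch of what such a tool can look like: the method names below follow gsm8k_tool.py_, but the exact base-class signatures vary between verl versions, so treat this as a shape rather than a drop-in implementation.

.. code-block:: python

   # Hypothetical sketch only -- check BaseTool in your verl version for the exact interface.
   from verl.tools.base_tool import BaseTool

   class MySearchTool(BaseTool):
       async def create(self, instance_id=None, **kwargs):
           # Initialize per-rollout state (e.g., remember the ground truth) and
           # return an identifier for this tool instance.
           return instance_id

       async def execute(self, instance_id, parameters, **kwargs):
           # Run the actual tool call and return (tool_response, tool_reward, metrics).
           query = parameters.get("query", "") if isinstance(parameters, dict) else str(parameters)
           tool_response = f"search results for: {query}"   # call your real backend here
           return tool_response, 0.0, {}

       async def calc_reward(self, instance_id, **kwargs):
           # Optional: score the final state of this tool instance.
           return 0.0

       async def release(self, instance_id, **kwargs):
           # Clean up any per-instance state.
           pass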
Finally, set the ``tools_config_file`` in your rollout config:
.. code-block:: yaml
actor_rollout_ref:
rollout:
tool_kwargs:
tools_config_file: <path_to_tool_yaml_file>
This allows integration of customized tool behaviors during actor rollout steps.
Multi-turn Tokenization
~~~~~~~~~~~~~~~~~~~~~~~
Tokenizing multi-turn rollouts poses a challenge: after applying the chat template and tokenizing the full message list, it’s hard to identify which tokens belong to assistant messages. Since the token list is flat, it lacks direct alignment with the message roles.
To address this, we adopt a **delta-based tokenization** strategy. Each time the LLM generates a new message, we:
1. Apply the chat template to all prior messages (`messages[:i]`).
2. Apply the chat template again including the latest message (`messages[:i+1]`).
3. Tokenize only the *delta* between these two serialized message strings.
This ensures that only tokens generated by the assistant are included in the loss mask.
.. code-block:: python
# Exclude the assistant prompt (e.g., "<|im_start|>assistant") from the loss by setting add_generation_prompt=True
prev = tokenizer.apply_chat_template(messages[:i], add_generation_prompt=True, tokenize=False)
curr = tokenizer.apply_chat_template(messages[:i+1], add_generation_prompt=False, tokenize=False)
new_token_ids = tokenizer.encode(curr[len(prev):], add_special_tokens=False)
token_ids += new_token_ids
loss_mask += [1] * len(new_token_ids)  # Mask only the newly generated assistant tokens
While we've validated that this produces results consistent with full-message tokenization, future models' chat templates could break compatibility. To guard against silent inconsistencies, we compare the delta-based tokenization with the full-tokenization results by default at the end of each rollout.
If you see the following warning, enable `INFO` log level to inspect the mismatched outputs:
.. code-block::
Inconsistent training and inference tokenization detected. This may lead to unexpected behavior during training. Please review your chat template to determine if this is intentional. For more information, refer to the multiturn README.md.
If the discrepancy is expected, you can disable the sanity check via:
``actor_rollout_ref.rollout.multi_turn.enable_tokenization_sanity_check=False``
Special Cases
^^^^^^^^^^^^^
Some models (e.g., Qwen/QwQ-32B and Qwen3 series) remove internal reasoning content during chat template rendering. As a result, the message content can vary across turns, making the delta-based tokenization inaccurate.
For example, for the following conversation:
.. code-block:: python
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is 2 + 2?"},
{"role": "assistant", "content": "<think>user asked about a simple math question.</think> 2 + 2 = 4."},
{"role": "user", "content": "Explain why."},
{"role": "assistant", "content": "<think>user wants to know the reasoning behind the answer. Search for a good explanation</think>",
"tool_calls": [{"id": "tool1", "type": "search", "arguments": {"query": "Why is 2 + 2 = 4?"}}]},
{"role": "tool", "content": "The sum of two and two is four because it is a basic arithmetic operation."},
{"role": "assistant", "content": "<think>The tool provided a good explanation.</think>The sum of two and two is four because it is a basic arithmetic operation."}
]
1. Qwen/QwQ-32B will remove all reasoning content except the last assistant message after applying the chat template.
.. code-block:: text
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What is 2 + 2?<|im_end|>
<|im_start|>assistant
2 + 2 = 4.<|im_end|>
<|im_start|>user
Explain why.<|im_end|>
<|im_start|>assistant
<tool_call>
{"name": "", "arguments": {"query": "Why is 2 + 2 = 4?"}}
</tool_call><|im_end|>
<|im_start|>user
<tool_response>
The sum of two and two is four because it is a basic arithmetic operation.
</tool_response><|im_end|>
<|im_start|>assistant
<think>The tool provided a good explanation.</think> The sum of two and two is four because it is a basic arithmetic operation.<|im_end|>
2. Qwen3 series will remove all reasoning content before the last user message.
.. code-block:: text
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What is 2 + 2?<|im_end|>
<|im_start|>assistant
2 + 2 = 4.<|im_end|>
<|im_start|>user
Explain why.<|im_end|>
<|im_start|>assistant
<think>
user wants to know the reasoning behind the answer. Search for a good explanation
</think>
<tool_call>
{"name": "", "arguments": {"query": "Why is 2 + 2 = 4?"}}
</tool_call><|im_end|>
<|im_start|>user
<tool_response>
The sum of two and two is four because it is a basic arithmetic operation.
</tool_response><|im_end|>
<|im_start|>assistant
<think>
The tool provided a good explanation.
</think>
The sum of two and two is four because it is a basic arithmetic operation.<|im_end|>
To handle this, we fall back to a **fixed base conversation** containing only a single system and user message. Since this base doesn’t include assistant messages or reasoning content, it remains consistent across turns.
.. code-block:: python
BASE_CHAT_HISTORY = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "I am a user."}
]
prev = tokenizer.apply_chat_template(BASE_CHAT_HISTORY, add_generation_prompt=True, tokenize=False)
curr = tokenizer.apply_chat_template([*BASE_CHAT_HISTORY, messages[i]], add_generation_prompt=False, tokenize=False)
new_token_ids = tokenizer.encode(curr[len(prev):], add_special_tokens=False)
token_ids += new_token_ids
loss_mask += [1] * len(new_token_ids)
This method works well for Qwen3 series. However, Qwen/QwQ-32B currently has a bug in its chat template. A fix_ has been proposed but not yet adopted. Until then, use the following command to download the fixed model revision:
.. code-block:: bash
pip install huggingface_hub
huggingface-cli download Qwen/QwQ-32B --revision refs/pr/81
.. _fix: https://huggingface.co/Qwen/QwQ-32B/discussions/81
Discrepancy Between Training and Inference Templates
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Although the above approach fixes the delta mismatch issue, the removal of reasoning content in the inference-time chat template introduces a new discrepancy: training uses the full reasoning content, while inference does not.
This mismatch can affect model performance in unpredictable ways. To avoid it, we default to using the full response (including reasoning) for both training and rollout.
However, this approach comes with trade-offs:
1. Long reasoning contents can easily exceed the model’s context window, especially in multi-turn rollout.
2. There's a mismatch between rollout and the production environment: models will not have reasoning content from past turns if you use the default chat template in production.
We are still evaluating the impact of these issues. If you experience context length problems or prefer rollouts that match production (i.e., exclude reasoning), you can enable:
``actor_rollout_ref.rollout.multi_turn.use_inference_chat_template = True``
GSM8K Multi-turn Training Performance
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
See the training performance of multi-turn rollout on the GSM8K task HERE_.
.. _HERE: https://wandb.ai/zhaochenyang20/gsm8k_async_rl/runs/1ro1r7om?nw=nwuserzhaochenyang20
.. _GSM8KTool_example_configuration: https://github.com/volcengine/verl/blob/main/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml
.. _gsm8k_tool.py: https://github.com/volcengine/verl/blob/main/verl/tools/gsm8k_tool.py
Search Tool Integration
~~~~~~~~~~~~~~~~~~~~~~~
.. toctree::
:maxdepth: 1
search_tool_example
\ No newline at end of file