Commit 6432075e by root

update LLM generation with tools in verl

parent 560f8208
@@ -13,6 +13,10 @@
*.slurm*
/*.sh*
**/tmp/
/data/*
/ret_one/*
/testbench/*
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -5,7 +5,10 @@
### Data preprocessing
The preprocessing script is `examples/data_preprocess/codev.py`; `scripts/preprocess.sh` records how it is invoked, including the paths of the raw data and the processed data.
### Core mechanism
In verl v0.4.0, the core mechanism of tool-calling rollouts lives in the `_async_rollout_a_request` function in `verl/workers/rollout/sglang_rollout/sglang_rollout.py` and the `AsyncRolloutRequest` class in `verl/workers/rollout/schemas.py`.
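For orientation, here is a schematic sketch of what such a multi-turn, tool-calling rollout does: sample an assistant turn, detect tool calls, execute them, append the observations, and repeat until a final answer or a turn limit is reached. The names below (`engine.generate`, `tools[...].execute`, `reply.tool_calls`) are placeholders, not verl's actual interfaces; the authoritative logic is in `_async_rollout_a_request`.

```python
# Illustrative sketch only; not verl's real API.
async def rollout_one_request(req, engine, tools, max_turns=8):
    messages = list(req.messages)
    for _ in range(max_turns):
        reply = await engine.generate(messages)            # sample one assistant turn
        messages.append({"role": "assistant", "content": reply.text})
        if not reply.tool_calls:                           # no tool call -> final answer
            break
        for call in reply.tool_calls:                      # execute each requested tool
            observation = await tools[call.name].execute(call.arguments)
            messages.append({"role": "tool", "content": observation})
    return messages
```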
## Example (RL)
Below is an example on the ereus2 dataset; the configuration script is `examples/tir/run_tir.sh`.
@@ -224,8 +227,7 @@ builtins.print = traced_print
#### Prompt length filtering did not account for the tool prompt, causing later prompt-length overflow errors
In the `_read_files_and_tokenize` function of `verl/utils/dataset/rl_dataset.py`, verl's initial prompt-length filtering does not count the prompt tokens contributed by the tools, so the estimated prompt length is too short and later runs fail with a length-overflow error. To address this, I added the length filtering back into preprocessing (`examples/data_preprocess/codev.py`).
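As a sketch of the idea behind the fix (not the exact code in `examples/data_preprocess/codev.py`), the prompt length can be measured with the tool schemas rendered into the chat template, so the filter sees the same prompt the rollout engine will see. The model name, field names and length limit below are placeholders:

```python
# Hedged sketch: measure prompt length with the tool schemas included before filtering.
from transformers import AutoTokenizer

MAX_PROMPT_LENGTH = 2048  # placeholder for data.max_prompt_length
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")  # placeholder model

def prompt_length_with_tools(messages, tools):
    # apply_chat_template(..., tools=...) renders the tool schemas into the prompt,
    # so the measured length matches what the rollout engine actually receives.
    token_ids = tokenizer.apply_chat_template(
        messages, tools=tools, add_generation_prompt=True, tokenize=True
    )
    return len(token_ids)

def keep_sample(sample, tools):
    # `sample["prompt"]` is assumed to be a list of chat messages.
    return prompt_length_with_tools(sample["prompt"], tools) <= MAX_PROMPT_LENGTH
```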
#### Finally, a (presumably) normal initial test-set accuracy
@@ -253,4 +255,8 @@ In the `_read_files_and_tokenize` function of `verl/utils/dataset/rl_dataset.py`, verl
"'val-core/taco/reward/best@2/mean': 0.0, 'val-core/taco/reward/best@2/std': "
"0.0, 'val-aux/taco/reward/worst@2/mean': 0.0, "
"'val-aux/taco/reward/worst@2/std': 0.0}")
```
## Example (Inference)
verl's current `verl/trainer/main_generation.py` does not support tool calling, so I copied some of the RL training code and adapted it to support tool calling; the file is `verl/trainer/custom_generation.py` and the launch script is `generate_test.sh`.
In addition, I made some small changes to `verl/trainer/config/generation.yaml` to adapt the configuration.
Bootstrap: docker
# Support - Training: fsdp; Inference: vllm
# FROM: rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.5-rocm630
%environment
export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
export HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
export CFLAGS="-D__HIP_PLATFORM_AMD__"
export CXXFLAGS="-D__HIP_PLATFORM_AMD__"
%post
# Create source directory
mkdir -p /opt/src
# Uninstall and reinstall vllm
pip uninstall -y vllm
cd /opt/src
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git
cd vllm
MAX_JOBS=$(nproc) python3 setup.py install
cd /opt
rm -rf /opt/src/vllm
# Install dependencies
pip install "tensordict<0.6" --no-deps
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]" \
torchdata \
transformers \
wandb \
orjson \
pybind11
# Clone and install verl from GitHub
cd /opt
git clone https://github.com/volcengine/verl.git
cd verl
# Uncomment to use a specific version
# git checkout v0.3.0.post0
pip install -e . --no-deps
# Install torch_memory_saver
pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps
FROM whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3
# For aws instances with EFA net interface (Sagemaker AI Pod)
# install EFA driver:
######## AWS EFA ############
ENV NCCL_VERSION=2.25.1-1
ENV DEBIAN_FRONTEND=noninteractive
ENV EFA_INSTALLER_VERSION=1.40.0
ENV AWS_OFI_NCCL_VERSION=1.14.2
ENV FI_EFA_SET_CUDA_SYNC_MEMOPS=0
ENV FI_PROVIDER=efa
RUN apt update && apt install -y linux-image-generic libhwloc-dev
RUN cd /tmp && \
curl -O https://efa-installer.amazonaws.com/aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
tar -xf aws-efa-installer-${EFA_INSTALLER_VERSION}.tar.gz && \
cd aws-efa-installer && \
./efa_installer.sh -y -g --skip-kmod --skip-limit-conf --no-verify && \
ldconfig && \
rm -rf /tmp/aws-efa-installer /var/lib/apt/lists/*
# NCCL EFA Plugin
RUN cd /tmp && \
curl -LO https://github.com/aws/aws-ofi-nccl/archive/refs/tags/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
tar -xzf /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
rm /tmp/v${AWS_OFI_NCCL_VERSION}.tar.gz && \
mv aws-ofi-nccl-${AWS_OFI_NCCL_VERSION} aws-ofi-nccl && \
cd /tmp/aws-ofi-nccl && \
./autogen.sh && \
./configure --prefix=/opt/amazon/efa \
--with-libfabric=/opt/amazon/efa \
--with-cuda=/usr/local/cuda \
--enable-platform-aws \
--with-mpi=/opt/amazon/openmpi && \
make -j$(nproc) install && \
rm -rf /tmp/aws-ofi-nccl
# NCCL
RUN echo "/usr/local/lib" >> /etc/ld.so.conf.d/local.conf && \
echo "/opt/amazon/openmpi/lib" >> /etc/ld.so.conf.d/efa.conf && \
ldconfig
ENV OMPI_MCA_pml=^cm,ucx \
OMPI_MCA_btl=tcp,self \
OMPI_MCA_btl_tcp_if_exclude=lo,docker0,veth_def_agent \
OPAL_PREFIX=/opt/amazon/openmpi \
NCCL_SOCKET_IFNAME=^docker,lo,veth_def_agent \
FI_EFA_USE_HUGE_PAGE=0
# docker build -t whatcanyousee/verl:awsefa --label "commit=$(git rev-parse --short HEAD)" .
# on aws:
# docker run --ipc=host --privileged --name verldev --gpus all --network=host --shm-size=1800gb -itd whatcanyousee/verl:awsefa
FROM verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
RUN pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
RUN cd /opt/nvidia && git clone --single-branch --branch core_r0.11.0 https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
# only config pip index with https://pypi.tuna.tsinghua.edu.cn/simple if needed
# unset for now
RUN cd /opt/nvidia/Megatron-LM && pip3 install --no-deps -e .
# docker buildx build --platform linux/x86_64 -t "verlai/verl:ngc-th2.4.0-cu124-vllm0.6.3-ray2.4-te1.7-v0.0.6" -f docker/Dockerfile.ngc.vllm . --builder cloud-verlai-verl-builder --progress=plain --push
FROM nvcr.io/nvidia/pytorch:24.05-py3
# uninstall nv-pytorch fork
RUN pip3 uninstall pytorch-quantization \
pytorch-triton \
torch \
torch-tensorrt \
torchvision \
xgboost transformer_engine flash_attn \
apex megatron-core -y
RUN pip3 install torch==2.4.0 torchvision==0.19.0 torchaudio==2.4.0 --index-url https://download.pytorch.org/whl/cu124
# =============== Megatron dependencies (optional) =================
# install apex, set MAX_JOBS to avoid OOMs
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# =============== End of Megatron dependencies (optional) =================
RUN pip3 install --no-cache-dir \
accelerate \
codetiming \
datasets \
dill \
hydra-core \
numpy \
'pandas' \
'peft' \
'pyarrow>=15.0.0' \
'pybind11' \
'pylatexenc' \
'ray>=2.10' \
'tensordict<0.6' \
'transformers' \
'vllm==0.6.3.post1' \
'wandb'
# full dependencies
RUN pip3 install pytest pre-commit py-spy pyext liger-kernel
# =============== Megatron dependencies (optional) =================
# install Transformer Engine, which requires FA 2.5.8. Do it in a separate step for docker cache
RUN MAX_JOBS=4 NINJA_FLAGS="-j4" pip3 install flash-attn==2.5.8 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
# =============== End of Megatron dependencies (optional) =================
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Install torch-2.6.0+cu124 + vllm-0.8.3
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
RUN pip install --no-cache-dir "vllm==0.8.3" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata \
"transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Install flashinfer-0.2.2.post1+cu124 (cxx11abi=False)
# vllm-0.8.3 does not support flashinfer>=0.2.3
# see https://github.com/vllm-project/vllm/pull/15777
RUN wget -nv https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.2.post1/flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl && \
pip install --no-cache-dir flashinfer_python-0.2.2.post1+cu124torch2.6-cp38-abi3-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install verl
RUN pip install --no-cache-dir verl[vllm] -U
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# Using a pre-built image from AWS DLC which contains the current version of python (3.10) and supported cuda version (12.1)
FROM 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-training:2.1.0-transformers4.36.0-gpu-py310-cu121-ubuntu20.04
# uninstall nv-pytorch fork
RUN pip3 uninstall -y pytorch-quantization \
pytorch-triton torch torch-tensorrt torchvision \
xgboost transformer_engine flash_attn apex megatron-core
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Install torch-2.6.0 + vllm-0.8.2
RUN pip install --no-cache-dir vllm==0.8.2 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata==0.11.0 \
transformers>=4.49.0 accelerate datasets peft hf-transfer \
ray[default] codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
pytest pre-commit py-spy pyext ruff
# Install flash_attn-2.7.4.post1
RUN pip uninstall -y transformer-engine flash-attn && \
pip install flash-attn==2.7.4.post1 --no-build-isolation
# Fix cv2
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir nvidia-ml-py>=12.560.30 opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
pip install --no-cache-dir --upgrade optree>=0.13.0
# Install verl
RUN pip install --no-cache-dir verl[vllm] -U
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
# Build the docker in the repo dir:
# docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
# docker images # you can find your built docker
# Support - Training: fsdp; Inference: vllm
# FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Support - Training: fsdp; Inference: vllm, sglang
FROM lmsysorg/sglang:v0.4.6.post5-rocm630
# Set working directory
# WORKDIR $PWD/app
# Set environment variables
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
ENV HIPCC_COMPILE_FLAGS_APPEND="--amdgpu-target=gfx90a;gfx942 -D__HIP_PLATFORM_AMD__"
ENV CFLAGS="-D__HIP_PLATFORM_AMD__"
ENV CXXFLAGS="-D__HIP_PLATFORM_AMD__"
# Install vllm
RUN pip uninstall -y vllm && \
rm -rf vllm && \
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
cd vllm && \
MAX_JOBS=$(nproc) python3 setup.py install && \
cd .. && \
rm -rf vllm
# Copy the entire project directory
COPY . .
# Install dependencies
RUN pip install "tensordict==0.6.2" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]<2.45.0" \
torchdata \
transformers \
wandb \
orjson \
pybind11 && \
pip install -e . --no-deps
# Install torch_memory_saver
RUN pip install git+https://github.com/ExtremeViscent/torch_memory_saver.git --no-deps
# Start from the NVIDIA official image (ubuntu-22.04 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
# Define installation arguments
ARG APT_SOURCE=https://mirrors.ustc.edu.cn/ubuntu/
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source
ARG PIP_INDEX=https://mirrors.aliyun.com/pypi/simple/
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip uninstall -y cuda-python && pip install "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
# Install torch-2.6.0
RUN pip install --no-cache-dir torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
transformers>=4.49.0 accelerate datasets peft hf_transfer \
ray[default] codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb liger-kernel \
pytest pre-commit py-spy pyext
# Install flash_attn-2.7.4.post1
RUN pip uninstall -y transformer-engine flash-attn && \
wget -v https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix cv2
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir nvidia-ml-py>=12.560.30 opencv-python-headless==4.8.0.74 fastapi==0.115.6
# docker buildx build --platform linux/x86_64 -t "verlai/verl:$TAG" -f docker/$FILE .
# the one in docker.io is an alias for the one in veturbo
# FROM vemlp-cn-beijing.cr.volces.com/veturbo/pytorch:2.4-cu124
FROM docker.io/haibinlin/verl:v0.0.5-th2.4.0-cu124-base
# only config pip index with https://pypi.tuna.tsinghua.edu.cn/simple if needed
# unset for now
RUN pip3 config unset global.index-url
# transformers 4.47.0 contains the following bug:
# AttributeError: 'Gemma2Attention' object has no attribute '_flash_attn_uses_top_left_mask'
RUN pip3 install --no-cache-dir \
torch==2.4.0 \
accelerate \
codetiming \
dill \
hydra-core \
numpy \
pybind11 \
tensordict \
"transformers <= 4.46.0"
RUN pip3 install --no-cache-dir flash-attn==2.7.0.post2 --no-build-isolation
# vllm depends on ray
RUN pip3 install --no-cache-dir vllm==0.6.3 ray==2.10
# install apex
RUN MAX_JOBS=4 pip3 install -v --disable-pip-version-check --no-cache-dir --no-build-isolation \
--config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" \
git+https://github.com/NVIDIA/apex
# install Transformer Engine
# - flash-attn pinned to 2.5.3 by TransformerEngine, switch to eric-haibin-lin/TransformerEngine.git@v1.7.0 to relax version req
# - install with: MAX_JOBS=1 NINJA_FLAGS="-j1" TE_BUILD_WITH_NINJA=0 to avoid OOM
# - cudnn is required by TransformerEngine
# RUN CUDNN_PATH=/opt/conda/lib/python3.11/site-packages/nvidia/cudnn \
# pip3 install git+https://github.com/eric-haibin-lin/TransformerEngine.git@v1.7.0
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" pip3 install flash-attn==2.5.3 --no-cache-dir --no-build-isolation
RUN MAX_JOBS=1 NINJA_FLAGS="-j1" pip3 install git+https://github.com/NVIDIA/TransformerEngine.git@v1.7
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Reinstall CUDA 12.4
RUN aria2c https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
RUN aria2c --always-resume=true --max-tries=99999 https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cuda-toolkit-12-4 && \
rm cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
update-alternatives --set cuda /usr/local/cuda-12.4 && \
rm -rf /usr/local/cuda-12.6
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1 + sglang-0.4.6.post5
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install torch-memory-saver --no-cache-dir
RUN pip install --no-cache-dir "vllm==0.8.5.post1" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata
RUN pip install --no-cache-dir "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
RUN pip install --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install Apex
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@v2.3
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.0
# Fix opencv
RUN pip install opencv-python
RUN pip install opencv-fixer && \
python -c "from opencv_fixer import AutoFix; AutoFix()"
# Install verl
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
RUN apt-get update && \
apt-get install -y aria2 libfreeimage3 libfreeimage-dev zlib1g
# Start from the NVIDIA official image (ubuntu-22.04 + cuda-12.6 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV PIP_ROOT_USER_ACTION=ignore
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini aria2 && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Uninstall nv-pytorch fork
RUN pip uninstall -y torch torchvision torchaudio \
pytorch-quantization pytorch-triton torch-tensorrt \
xgboost transformer_engine flash_attn apex megatron-core grpcio
# Reinstall CUDA 12.4
RUN aria2c https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \
mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600
RUN aria2c --always-resume=true --max-tries=99999 https://developer.download.nvidia.com/compute/cuda/12.4.1/local_installers/cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
dpkg -i cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
cp /var/cuda-repo-ubuntu2204-12-4-local/cuda-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cuda-toolkit-12-4 && \
rm cuda-repo-ubuntu2204-12-4-local_12.4.1-550.54.15-1_amd64.deb && \
update-alternatives --set cuda /usr/local/cuda-12.4 && \
rm -rf /usr/local/cuda-12.6
# Install torch-2.6.0+cu124 + vllm-0.8.5.post1 + sglang-0.4.6.post5
# torch-2.6.0+cu124: cxx11abi=False
# torch-2.6.0+cu126: cxx11abi=True
# see https://github.com/flashinfer-ai/flashinfer/issues/911
# Install sglang-0.4.6.post5 and torch-memory-saver
RUN pip install --resume-retries 999 "sglang[all]==0.4.6.post5" --no-cache-dir --find-links https://flashinfer.ai/whl/cu124/torch2.6/flashinfer-python && pip install --resume-retries 999 torch-memory-saver --no-cache-dir
RUN pip install --resume-retries 999 --no-cache-dir "vllm==0.8.5.post1" "torch==2.6.0" "torchvision==0.21.0" "torchaudio==2.6.0" "tensordict==0.6.2" torchdata
RUN pip install --resume-retries 999 --no-cache-dir "transformers[hf_xet]>=4.51.0" accelerate datasets peft hf-transfer \
"numpy<2.0.0" "pyarrow>=15.0.0" pandas \
ray[default] codetiming hydra-core pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler blobfile \
pytest py-spy pyext pre-commit ruff
# Install flash-attn-2.7.4.post1 (cxx11abi=False)
RUN wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix packages
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --resume-retries 999 --no-cache-dir --upgrade "nvidia-ml-py>=12.560.30" "fastapi[standard]>=0.115.0" "optree>=0.13.0" "pydantic>=2.9" "grpcio>=1.62.1"
# Install cudnn
RUN aria2c --max-tries=9999 https://developer.download.nvidia.com/compute/cudnn/9.8.0/local_installers/cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
dpkg -i cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb && \
cp /var/cudnn-local-repo-ubuntu2204-9.8.0/cudnn-*-keyring.gpg /usr/share/keyrings/ && \
apt-get update && \
apt-get -y install cudnn-cuda-12 && \
rm cudnn-local-repo-ubuntu2204-9.8.0_1.0-1_amd64.deb
RUN pip install --resume-retries 999 --no-cache-dir nvidia-cudnn-cu12==9.8.0.87
# Install Apex
RUN git clone https://github.com/NVIDIA/apex.git && \
cd apex && \
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
# Install TransformerEngine
RUN export NVTE_FRAMEWORK=pytorch && pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/TransformerEngine.git@v2.3
# Install Megatron-LM
RUN pip3 install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.12.1
# Fix opencv
RUN pip install opencv-python
RUN pip install opencv-fixer && \
python -c "from opencv_fixer import AutoFix; AutoFix()"
# Install verl
# Reset pip config
RUN pip config unset global.index-url && \
pip config unset global.extra-index-url
RUN apt-get update && \
apt-get install -y aria2 libfreeimage3 libfreeimage-dev zlib1g
# Start from the NVIDIA official image (ubuntu-22.04 + python-3.10)
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
FROM nvcr.io/nvidia/pytorch:24.08-py3
# uninstall nv-pytorch fork
RUN pip3 uninstall -y pytorch-quantization \
pytorch-triton torch torch-tensorrt torchvision \
xgboost transformer_engine flash_attn apex megatron-core
# Define environments
ENV MAX_JOBS=32
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
ENV DEBIAN_FRONTEND=noninteractive
ENV NODE_OPTIONS=""
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
# Define installation arguments
ARG APT_SOURCE=https://mirrors.tuna.tsinghua.edu.cn/ubuntu/
ARG PIP_INDEX=https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple
# Set apt source
RUN cp /etc/apt/sources.list /etc/apt/sources.list.bak && \
{ \
echo "deb ${APT_SOURCE} jammy main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-updates main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-backports main restricted universe multiverse"; \
echo "deb ${APT_SOURCE} jammy-security main restricted universe multiverse"; \
} > /etc/apt/sources.list
# Install systemctl
RUN apt-get update && \
apt-get install -y -o Dpkg::Options::="--force-confdef" systemd && \
apt-get clean
# Install tini
RUN apt-get update && \
apt-get install -y tini && \
apt-get clean
# Change pip source
RUN pip config set global.index-url "${PIP_INDEX}" && \
pip config set global.extra-index-url "${PIP_INDEX}" && \
python -m pip install --upgrade pip
# Install torch-2.6.0 + vllm-0.8.1
RUN pip install --no-cache-dir vllm==0.8.1 torch==2.6.0 torchvision==0.21.0 torchaudio==2.6.0 tensordict torchdata \
transformers>=4.49.0 accelerate datasets peft hf-transfer \
ray codetiming hydra-core pandas pyarrow>=15.0.0 pylatexenc qwen-vl-utils wandb dill pybind11 liger-kernel mathruler \
pytest yapf py-spy pyext pre-commit ruff
# Install flash_attn-2.7.4.post1
RUN pip uninstall -y transformer-engine flash-attn && \
wget -nv https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.4.post1/flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl && \
pip install --no-cache-dir flash_attn-2.7.4.post1+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
# Fix cv2
RUN pip uninstall -y pynvml nvidia-ml-py && \
pip install --no-cache-dir nvidia-ml-py>=12.560.30 opencv-python-headless==4.8.0.74 fastapi==0.115.6 && \
pip install -U optree>=0.13.0
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SPHINXPROJ = verl
SOURCEDIR = .
BUILDDIR = _build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
# verl documentations
## Build the docs
```bash
# Install dependencies.
pip install -r requirements-docs.txt
# Build the docs.
make clean
make html
```
## Open the docs with your browser
```bash
python -m http.server -d _build/html/
```
Launch your browser and navigate to http://localhost:8000 to view the documentation.
# Upgrading to vllm >= 0.7
Note: verl+vllm 0.8.3 is now stable. Please see ``docs/README_vllm0.8.md`` for the upgrade guide.
## Installation
Note: At time of writing, verl+vllm 0.7.x supports **FSDP** for training and **vLLM** for rollout.
```
# Create the conda environment
conda create -n verl python==3.10
conda activate verl
# Install verl
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
# Install the latest stable version of vLLM
pip3 install vllm==0.7.3
# Install flash-attn
pip3 install flash-attn --no-build-isolation
```
Note that if you are installing a lower version of vLLM (0.7.0, 0.7.1, 0.7.2), you need to manually apply some small patches to vllm (under /path/to/site-packages/vllm after installation) after the above steps:
- vllm/distributed/parallel_state.py: Remove the assertion below:
```
if (world_size
!= tensor_model_parallel_size * pipeline_model_parallel_size):
raise RuntimeError(
f"world_size ({world_size}) is not equal to "
f"tensor_model_parallel_size ({tensor_model_parallel_size}) x "
f"pipeline_model_parallel_size ({pipeline_model_parallel_size})")
```
- vllm/executor/uniproc_executor.py: change `local_rank = rank` to `local_rank = int(os.environ["LOCAL_RANK"])`
- vllm/model_executor/model_loader/weight_utils.py: remove the `torch.cuda.empty_cache()` in `pt_weights_iterator`
## Features
### Use cuda graph
After installation, examples using FSDP as training backends can be used. By default, the `enforce_eager` is set to True, which disables the cuda graph. To enjoy cuda graphs and the sleep mode of vLLM>=0.7, add the following lines to the bash script:
```
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
```
For a typical job like examples/ppo_trainer/run_qwen2-7b_seq_balance.sh, the rollout generation time is 115 seconds with vLLM 0.6.3 and 85 seconds with vLLM 0.7.0. By enabling CUDA graphs, the generation duration is further reduced to 62 seconds.
**Note:** Currently, if `n` is greater than 1 in `SamplingParams` in vLLM>=0.7, the V0 engine may show unstable rollout generation times (some iterations see bursts in generation time).
### Use vLLM V1 Engine
Using the vLLM V1 engine can avoid instability issues and achieve additional performance improvements. To use the V1 engine, you can first uninstall the previously installed vLLM and then follow the steps below to install the newer version.
```
git clone https://github.com/vllm-project/vllm.git
cd vllm
git checkout 2275784
sed -i "903a\ data_parallel_size = world_size // pipeline_model_parallel_size // tensor_model_parallel_size" ./vllm/distributed/parallel_state.py
VLLM_USE_PRECOMPILED=1 pip install --editable .
```
Then you can enable the V1 engine by setting `export VLLM_USE_V1=1`. In some benchmark tests, the V1 engine demonstrates a 1.5x speed improvement over the vLLM V0 engine.
Stable support for the vLLM V1 engine is available on verl main.
# Upgrading to vLLM >= 0.8
## Installation
Note: This version of verl+vLLM 0.8+ supports **FSDP** for training and **vLLM** for rollout.
```bash
# Create the conda environment
conda create -n verl python==3.10
conda activate verl
# Install verl
git clone https://github.com/volcengine/verl.git
cd verl
pip3 install -e .
# Install the latest stable version of vLLM
pip3 install vllm==0.8.3
# Install flash-attn
pip3 install flash-attn --no-build-isolation
```
We have a pre-built docker image for verl+vLLM 0.8.3. You can pull it directly with the following command:
```bash
docker pull hiyouga/verl:ngc-th2.6.0-cu126-vllm0.8.3-flashinfer0.2.2-cxx11abi0
```
## Features
vLLM 0.8+ supports CUDA graphs and the V1 engine by default in verl. To enable these features, remember to add the following lines to the bash script:
```bash
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
```
and also **remove** the environment variable if it exists:
```bash
# If you are using vllm<=0.6.3, you might need to set the following environment variable to avoid bugs:
# export VLLM_ATTENTION_BACKEND=XFORMERS
```
## Notes
If you directly upgrade to vllm>=0.8, some dependency packages may change versions. If you encounter the following problem:
```bash
in <module> from torch.multiprocessing.reductions import ForkingPickler ImportError: cannot import name 'ForkingPickler' from 'torch.multiprocessing.reductions' (/opt/conda/lib/python3.11/site-packages/torch/multiprocessing/reductions.py)
```
You need to upgrade `tensordict` to version 0.6.2 using the command `pip install tensordict==0.6.2`.
document.addEventListener("DOMContentLoaded", function () {
var script = document.createElement("script");
script.type = "module";
script.id = "runllm-widget-script";
script.src = "https://widget.runllm.com";
script.setAttribute("version", "stable");
script.setAttribute("crossorigin", "true");
script.setAttribute("runllm-keyboard-shortcut", "Mod+j");
script.setAttribute("runllm-name", "verl Chatbot");
script.setAttribute("runllm-position", "TOP_RIGHT");
script.setAttribute("runllm-assistant-id", "679");
script.async = true;
document.head.appendChild(script);
});
Using Checkpoints to Support Fault Tolerance Training
=====================================================
Training errors or machine failures can occur during the RLHF training process,
so it is recommended to enable checkpointing to minimize your losses.
The API interface has already been listed in :ref:`config-explain-page`,
and we will not repeat it here. But there are still some technical details
we hope to clarify.
.. note::
Notice that the ``checkpoint.contents`` field has no effect on the FSDP checkpoint except for ``hf_model``;
the other three fields are bound together for saving and loading. We recommend including all of ``model``, ``optimizer`` and ``extra``.
Checkpoint Saving Directory Structure
-------------------------------------
Commonly, we use the ``default_local_dir`` declared in ``ppo_trainer.yaml`` or ``ppo_megatron_trainer.yml``
as the prefix when saving checkpoints, which is ``checkpoints/${trainer.project_name}/${trainer.experiment_name}``.
The inner checkpoint structure of **FSDP** then looks like:
.. code::
checkpoints/${trainer.project_name}/${trainer.experiment_name}
├── global_steps_${i}
│ ├── actor
│ │ ├── model_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt
│ ├── actor_huggingface
│ ├── critic
│ │ ├── model_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ ├── optim_world_size_{self.world_size}_rank_{self.rank}.pt
│ │ └── extra_state_world_size_{self.world_size}_rank_{self.rank}.pt
│ └── critic_huggingface
└── latest_checkpointed_iteration.txt
All model shards, optimizer states and extra states are stored together, in a sharded and distributed way.
The current **Megatron** checkpoint structure is:
.. code::
checkpoints/${trainer.project_name}/${trainer.experiment_name}
├── global_steps_${i}
│ ├── actor
│ │ ├── huggingface # saves the tokenizer by default; also saves the huggingface model if ``hf_model`` is included in checkpoint.contents
│ │ ├── model # save sharded model, naming the same as Megatron
│ │ │ ├── mp_rank_xx_yyy # xx is tp_rank in 2 digits, yyy is pp_rank in 3 digits
│ │ │ │ └── model_states.pt
│ │ │ └── mp_rank_xx_xxx
│ │ ├── optim
│ │ │ └── distrib_optim_pp{a}_tp{b}_cp{c}_dp{d}.pt
│ │ └── rng_states
│ └── critic
│ │ ├── huggingface
│ │ ├── model
│ │ ├── optim
│ │ └── rng_states
└── latest_checkpointed_iteration.txt
Convert FSDP and Megatron Checkpoints to HuggingFace Format Model
-----------------------------------------------------------------
We provide a tool to convert FSDP and Megatron checkpoints to a HuggingFace-format model.
The tool is located in ``scripts/model_merger.py``.
The script supports two main sub-commands: `merge` (to convert and save checkpoints) and `test` (to validate merged checkpoints against a reference model).
The arguments for the `merge` sub-command are as follows:
.. code:: bash
usage: model_merger.py merge [-h] --backend {fsdp,megatron} --local_dir LOCAL_DIR [--hf_model_path HF_MODEL_PATH]
[--tie-word-embedding] [--is-value-model] [--target_dir TARGET_DIR]
[--hf_upload_path HF_UPLOAD_PATH] [--private]
options:
-h, --help show this help message and exit
--backend {fsdp,megatron}
The backend of the model
--local_dir LOCAL_DIR
Path to the saved model checkpoints
--hf_model_path HF_MODEL_PATH
(Deprecated) Path to the original Hugging Face model for config.
--tie-word-embedding Whether to tie word embedding weights (currently only Megatron supported)
--is-value-model Whether the model is a value model (currently only Megatron supported)
--target_dir TARGET_DIR
Directory to save the merged huggingface model
--hf_upload_path HF_UPLOAD_PATH
Hugging Face repository ID to upload the model
--private Whether to upload the model to a private Hugging Face repository
Example usage for merging Megatron checkpoints:
.. code:: bash
python scripts/model_merger.py merge \
--backend megatron \
--tie-word-embedding \
--local_dir checkpoints/verl_megatron_gsm8k_examples/qwen2_5_0b5_megatron_saveload/global_step_1/actor \
--target_dir /path/to/merged_hf_model
Example usage for merging FSDP checkpoints:
.. code:: bash
python scripts/model_merger.py merge \
--backend fsdp \
--local_dir checkpoints/verl_fsdp_gsm8k_examples/qwen2_5_0b5_fsdp_saveload/global_step_1/actor \
--target_dir /path/to/merged_hf_model
Megatron Merger details
-----------------------
The current implementation of the decoder layers uses ``nn.ModuleList`` to store the layers,
so the model layers on every PP rank and VPP rank start their indices from 0.
There are three ways to correct this behavior:
1. Modify the decoder layers' state_dict by adding an ``offset`` to each layer's index, which means rewriting the ``nn.ModuleList`` implementation.
2. Modify the layer indices when saving the checkpoint and restore them when loading it.
3. Let the checkpoint merger do this work, computing the actual ``offset`` from the ``state_dict`` alone, which is a little more complex.
The current implementation uses solution 2, as sketched below.
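A minimal sketch of solution 2 (illustrative only; the key pattern and helper name here are assumptions, see ``scripts/model_merger.py`` for the real logic):

.. code:: python

# Hedged sketch: shift local per-PP-rank layer indices by the rank's global
# offset when saving, and shift them back (negative offset) when loading.
import re

_LAYER_KEY = re.compile(r"(decoder\.layers\.)(\d+)(\.)")  # assumed key layout

def shift_layer_indices(state_dict, offset):
    """Rename 'decoder.layers.<i>.' keys to 'decoder.layers.<i + offset>.'."""
    shifted = {}
    for key, tensor in state_dict.items():
        new_key = _LAYER_KEY.sub(
            lambda m: f"{m.group(1)}{int(m.group(2)) + offset}{m.group(3)}", key
        )
        shifted[new_key] = tensor
    return shifted

# Saving on a PP rank that owns global layers [offset, offset + num_local_layers):
#   global_sd = shift_layer_indices(local_sd, offset)
# Loading: apply shift_layer_indices(global_sd, -offset) before load_state_dict.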
HuggingFace to Megatron DistCheckpoint details
----------------------------------------------
If your model is very large, we recommend using the Megatron dist-checkpoint to load the model.
The Megatron dist-checkpoint supports loading with different kinds of model parallelism,
and it is much faster than the original checkpoint loading.
To convert an original HuggingFace model to a Megatron dist-checkpoint,
you can use the ``scripts/converter_hf_to_mcore.py`` script. Large MoE models are temporarily supported via CPU initialization,
which is a little slower; we are working on a better solution to support large models.
Example command to convert the model is as follows:
.. code:: bash
python scripts/converter_hf_to_mcore.py \
--hf_model_path Qwen/Qwen1.5-MoE-A2.7B-Chat \
--output_path /mnt/disk/Qwen/Qwen1.5-MoE-A2.7B-Chat \
--use_cpu_initialization # Only work for MoE models
Original Checkpoint Utils
-------------------------
Original checkpoint utils refer to the original checkpoint implementation in ``verl/models/[model]/megatron/checkpoint_utils``.
We now only need ``[model]_loader.py`` from the original checkpoint utils, since we no longer store ``hf_model`` every time (which is not recommended for large-model training; try to save only sharded models if you can).
.. note::
Note that ``[model]_loader`` only supports environments where the **storage cluster can be reached from every compute node**,
because it uses a **sharded loading scheme to minimize the checkpoint-loading overhead**.
Every rank loads its own data from a ``state_dict`` that all of them can access.
There is also no need to broadcast among DP ranks, since the saved state_dict is produced only by DP rank 0.
For users who can **only place the huggingface model on one device**, we keep the original, costly implementation in ``[model]_loader_deprecated``. In this implementation, rank 0 broadcasts all weights to each TP and PP rank, and then DP rank 0 broadcasts to all DP ranks. This carries a risk of OOM.
To use the deprecated loader, change the import package of ``load_state_dict_to_megatron_llama``.
Extend to other RL(HF) algorithms
=================================
We have already implemented the complete training pipeline of the PPO
algorithm. To extend to other algorithms, we analyze the high-level
principles of using verl and provide a tutorial for implementing the DPO
algorithm. Users can follow a similar paradigm to extend to other RL algorithms.
.. note:: **Key ideas**: Single process drives multi-process computation and data communication.
Overall Approach
----------------
Step 1: Consider what multi-machine, multi-GPU computations are needed
for each model, such as ``generate_sequence``, ``compute_log_prob`` and
``update_policy`` in the actor_rollout model. Implement distributed
single-program-multiple-data (SPMD) computation and encapsulate it
into APIs.
Step 2: Based on different distributed scenarios, including FSDP and 3D
parallelism in Megatron-LM, implement single-process control of data
interaction among multi-process computations.
Step 3: Utilize the encapsulated APIs to implement the control flow
Example: Online DPO
-------------------
We use verl to implement a simple online DPO algorithm. The algorithm
flow of Online DPO is as follows:
1. There is a prompt (rollout) generator which has the same weights as
the actor model. After a batch of prompts is fed into the generator,
it generates N responses for each prompt.
2. Send all the prompts + responses to a verifier for scoring, which can
be a reward model or a rule-based function. Then sort them in pairs to
form a training batch.
3. Use this training batch to train the actor model using DPO. During
the process, a reference policy is needed.
Step 1: What are the multi-machine multi-GPU computations
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**Sample Generator**
Implementation details:
.. code:: python
from verl.single_controller.base import Worker
from verl.single_controller.ray import RayWorkerGroup, RayClassWithInitArgs, RayResourcePool
import ray
@ray.remote
class SampleGenerator(Worker):
    def __init__(self, config):
        super().__init__()
        self.config = config

    def generate_sequences(self, data):
        pass
Here, ``SampleGenerator`` can be viewed as a group of processes launched by
``torchrun``, with each process running the same code (SPMD).
``SampleGenerator`` needs to implement a ``generate_sequences`` API for
the control flow to call. The implementation inside can use any
inference engine, including vllm, sglang and huggingface. Users can
largely reuse the code in
verl/verl/workers/rollout/vllm_rollout/vllm_rollout.py and we won't
go into details here.
**ReferencePolicy inference**
API: compute reference log probability
.. code:: python
from verl.single_controller.base import Worker
import ray
@ray.remote
class ReferencePolicy(Worker):
    def __init__(self):
        super().__init__()
        self.model = Model()

    def infer(self, data):
        return self.model(data)
**Actor update**
API: Update actor model parameters
.. code:: python
from verl.single_controller.base import Worker
import ray
@ray.remote
class DPOActor(Worker):
    def __init__(self):
        super().__init__()
        self.model = Model()
        self.model = FSDP(self.model)  # or other distributed strategy
        self.optimizer = optim.Adam(self.model.parameters(), lr=1e-3)
        self.loss_fn = xxx

    def update(self, data):
        self.optimizer.zero_grad()
        logits = self.model(data)
        loss = self.loss_fn(logits)
        loss.backward()
        self.optimizer.step()
**Notes: How to distinguish between control processes and distributed computation processes**
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- Control processes are generally functions directly decorated with
``@ray.remote``
- Computation processes are all wrapped into a ``RayWorkerGroup``.
Users can reuse most of the distributed computation logic implemented
in the PPO algorithm, including the FSDP and Megatron-LM backends in
verl/verl/trainer/ppo.
Step 2: Based on different distributed scenarios, implement single-process control of multi-process data interaction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
**The core problem to solve here is how a single process sends data to
multiple processes, drives multi-process computation, and how the
control process obtains the results of multi-process computation.**
First, we initialize the multi-process ``WorkerGroup`` in the control
process.
.. code:: python
@ray.remote(num_cpus=1)
def main_task(config):
    # construct SampleGenerator
    resource_pool = RayResourcePool(process_on_nodes=[8] * 2)  # 16 GPUs
    ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
    # put SampleGenerator onto resource pool
    worker_group = RayWorkerGroup(resource_pool, ray_cls)
    # construct reference policy
As we can see, in the control process, multiple processes are wrapped
into a ``RayWorkerGroup``. Inside this ``WorkerGroup``, there is a
``self._workers`` member, where each worker is a RayActor
(https://docs.ray.io/en/latest/ray-core/actors.html) of SampleGenerator.
ray_trainer.md also provides an implementation of
``MegatronRayWorkerGroup``.
Assuming the model is distributed using FSDP, and there is a batch of
data on the control process, for data parallelism, the underlying
calling process is:
.. code:: python
data = xxx
data_list = data.chunk(dp_size)
output = []
for d in data_list:
    # worker_group._workers[i] is a SampleGenerator
    output.append(worker_group._workers[i].generate_sequences.remote(d))
output = ray.get(output)
output = torch.cat(output)
Single process calling multiple processes involves the following 3
steps:
1. Split the data into DP parts on the control process.
2. Send the data to remote, call the remote computation through RPC, and
utilize multi-process computation.
3. Obtain the computation results of each worker on the control process
and merge them.
Frequently calling these 3 steps on the controller process greatly hurts
code readability. **In verl, we have abstracted and encapsulated these 3
steps, so that the worker's method + dispatch + collect can be
registered into the worker_group**
.. code:: python
from verl.single_controller.base.decorator import register

def dispatch_data(worker_group, data):
    return data.chunk(worker_group.world_size)

def collect_data(worker_group, data):
    return torch.cat(data)

dispatch_mode = {
    'dispatch_fn': dispatch_data,
    'collect_fn': collect_data
}

@register(dispatch_mode=dispatch_mode)
def generate_sequences(self, data):
    pass
In this way, we can directly call the method inside the worker through
the ``worker_group`` on the control (driver) process (which is a single
process):
.. code:: python
output = worker_group.generate_sequences(data)
This single line includes data splitting, data distribution and
computation, and data collection.
Furthermore, the model parallelism size of each model is usually fixed,
including dp, tp, pp. So for these common distributed scenarios, we have
pre-implemented specific dispatch and collect methods in `decorator.py <https://github.com/volcengine/verl/blob/main/verl/single_controller/base/decorator.py>`_, which can be directly used to wrap the computations.
.. code:: python
from verl.single_controller.base.decorator import register, Dispatch

@register(dispatch_mode=Dispatch.DP_COMPUTE_PROTO)
def generate_sequences(self, data: DataProto) -> DataProto:
    pass
Here the data interface is required to be ``DataProto``. The definition of
``DataProto`` is in `protocol.py <https://github.com/volcengine/verl/blob/main/verl/protocol.py>`_.
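For example, a batch can be wrapped into a ``DataProto`` before being passed to the registered method (a small sketch; shapes and field names are illustrative, and ``worker_group`` is the one constructed earlier):

.. code:: python

import torch
from verl import DataProto

# Wrap a dict of equally sized tensors into a DataProto
# (see protocol.py for the authoritative interface).
batch = DataProto.from_dict(
    tensors={
        "input_ids": torch.randint(0, 32000, (16, 512)),
        "attention_mask": torch.ones(16, 512, dtype=torch.long),
    }
)

# The dispatch function chunks the batch across DP ranks and the collect
# function concatenates the per-rank outputs.
output = worker_group.generate_sequences(batch)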
Step 3: Main training loop
~~~~~~~~~~~~~~~~~~~~~~~~~~
With the above training flows, we can implement the algorithm's control
flow. It is recommended that ``main_task`` is also a ray remote process.
.. code:: python
@ray.remote(num_cpus=1)
def main_task(config):
    # construct SampleGenerator
    resource_pool = RayResourcePool(process_on_nodes=[8] * 2)  # 16 GPUs
    ray_cls = RayClassWithInitArgs(SampleGenerator, config=config)
    # put SampleGenerator onto resource pool
    sample_gen = RayWorkerGroup(resource_pool, ray_cls)

    # construct reference policy
    ray_cls = RayClassWithInitArgs(ReferencePolicy)
    ref_policy = RayWorkerGroup(resource_pool, ray_cls)

    # construct actor
    ray_cls = RayClassWithInitArgs(DPOActor)
    dpo_policy = RayWorkerGroup(resource_pool, ray_cls)

    dataloader = DataLoader()
    for data in dataloader:
        # generate data
        data = sample_gen.generate_sequences(data)
        # generate scores for each data
        data = generate_scores(data)
        # generate pairwise data using scores
        data = generate_pairwise_data(data)
        # generate ref_log_prob
        data.batch['ref_log_prob'] = ref_policy.infer(data)
        # update using dpo
        dpo_policy.update(data)
        # logging
Here, different ``WorkerGroups`` can be placed in the same resource pool or
in different resource pools using ``create_colocated_worker_cls``,
similar to `ray_trainer.py <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/ray_trainer.py>`_.
Add models with the FSDP backend
==================================
Model
--------------------------
In principle, our FSDP backend can support any HF model, and we can
synchronize the actor model weights with vLLM using `hf_weight_loader.py` under `third_party/vllm`.
However, ``hf_weight_loader`` will gather the full state_dict of a
model during synchronization, which may cause OOM. We suggest using
``dtensor_weight_loader``, which gathers the full model parameters layer by
layer to reduce the peak memory usage. We already support the dtensor weight
loader for the models below in `dtensor_weight_loader.py` under `third_party/vllm`:
- ``GPT2LMHeadModel``
- ``LlamaForCausalLM``
- ``LLaMAForCausalLM``
- ``MistralForCausalLM``
- ``InternLMForCausalLM``
- ``AquilaModel``
- ``AquilaForCausalLM``
- ``Phi3ForCausalLM``
- ``GemmaForCausalLM``
- ``Gemma2ForCausalLM``
- ``GPTBigCodeForCausalLM``
- ``Starcoder2ForCausalLM``
- ``Qwen2ForCausalLM``
- ``DeepseekV2ForCausalLM``
To implement the ``dtensor_weight_loader`` of a model that is supported in
vLLM, follow the guide for the gemma model below:
1. Copy the
``load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]])`` from the vllm model class
to ``dtensor_weight_loaders.py``.
2. Modify the arguments to
``(actor_weights: Dict, vllm_model: nn.Module)``.
3. Replace ``self`` with ``vllm_model``.
4. Add
``local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)``
before each ``param = params_dict[name]`` and modify the subsequent
weight loading to use ``local_loaded_weight``.
5. Register the implemented dtensor weight loader to ``__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__``.
.. code-block:: diff
- def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+ def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
      stacked_params_mapping = [
          # (param_name, shard_name, shard_id)
          ("qkv_proj", "q_proj", "q"),
          ("qkv_proj", "k_proj", "k"),
          ("qkv_proj", "v_proj", "v"),
          ("gate_up_proj", "gate_proj", 0),
          ("gate_up_proj", "up_proj", 1),
      ]
-     params_dict = dict(self.named_parameters())
+     params_dict = dict(vllm_model.named_parameters())
      loaded_params = set()
-     for name, loaded_weight in weights:
+     for name, loaded_weight in actor_weights.items():
          for (param_name, shard_name, shard_id) in stacked_params_mapping:
              if shard_name not in name:
                  continue
              name = name.replace(shard_name, param_name)
              # Skip loading extra bias for GPTQ models.
              if name.endswith(".bias") and name not in params_dict:
                  continue
+             local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
              param = params_dict[name]
              weight_loader = param.weight_loader
-             weight_loader(param, loaded_weight, shard_id)
+             weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
              break
          else:
              # lm_head is not used in vllm as it is tied with embed_token.
              # To prevent errors, skip loading lm_head.weight.
              if "lm_head.weight" in name:
                  continue
              # Skip loading extra bias for GPTQ models.
              if name.endswith(".bias") and name not in params_dict:
                  continue
+             local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
              param = params_dict[name]
              weight_loader = getattr(param, "weight_loader",
                                      default_weight_loader)
-             weight_loader(param, loaded_weight)
+             weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
          loaded_params.add(name)
      unloaded_params = params_dict.keys() - loaded_params
      if unloaded_params:
          raise RuntimeError(
              "Some weights are not initialized from checkpoints: "
              f"{unloaded_params}")
Add models with the Megatron-LM backend
=========================================
Model
-----------
If you use the latest verl, the Megatron backend has direct support for ``GPTModel``.
You can add custom models in much the same way that Megatron is used to pretrain them.
We list the steps here:
1. Find `model_initializer.py <https://github.com/volcengine/verl/blob/main/verl/models/mcore/model_initializer.py>`_
2. If your model is configurable by ``TransformerLayerSpec``, you can
directly use ``GPTModel``. Otherwise, please implement a new
``ModelLayerSpec`` and ``ModelLayer`` here.
3. Use the right ``LayerSpec``, ``TransformerConfig`` and ``HuggingfaceConfig``
as arguments to initialize the GPTModel.
4. Return the model at the end (a minimal sketch follows below).
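A minimal sketch of steps 2-4 (values are illustrative; in verl the ``TransformerConfig`` is derived from the HuggingFace config inside ``model_initializer.py``, and Megatron's model-parallel state must already be initialized):

.. code:: python

from megatron.core.models.gpt import GPTModel
from megatron.core.models.gpt.gpt_layer_specs import get_gpt_layer_with_transformer_engine_spec
from megatron.core.transformer import TransformerConfig

def initialize_gpt_model(pre_process=True, post_process=True):
    # Illustrative values; in verl they come from the HuggingFace config.
    config = TransformerConfig(num_layers=32, hidden_size=4096, num_attention_heads=32)
    layer_spec = get_gpt_layer_with_transformer_engine_spec()
    model = GPTModel(
        config=config,
        transformer_layer_spec=layer_spec,
        vocab_size=152064,            # from the tokenizer / HF config
        max_sequence_length=4096,
        pre_process=pre_process,      # first PP stage holds the embedding
        post_process=post_process,    # last PP stage holds the output layer
    )
    return model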
Add Models with old version of verl
-----------------------------------
The most challenging aspect of using the Megatron-LM backend is implementing
the models for training. Currently, we implement the Llama model, which
supports data parallelism, tensor parallelism, pipeline parallelism (also
vPP) and sequence parallelism. We also implement remove padding (sequence packing) for the Llama
model, which can be found in `modeling_llama_megatron.py <https://github.com/volcengine/verl/blob/main/verl/models/llama/megatron/modeling_llama_megatron.py>`_.
To support other models, users are required to implement:
1. A model similar to ``modeling_llama_megatron.py`` that satisfies the
parallelism requirements of Megatron-LM. Then register your model in
the `registry.py <https://github.com/volcengine/verl/blob/main/verl/models/registry.py>`_.
2. Checkpoint utils that can load a full checkpoint (e.g. a huggingface
checkpoint) into the partitioned models at runtime. Then register
your loader to ``weight_loader_registry`` in `weight_loader_registry.py <https://github.com/volcengine/verl/blob/main/verl/models/weight_loader_registry.py>`_.
3. A weight loader that synchronizes the weights from the Megatron model to the rollout
(vLLM) model. Note that both the actor model and the rollout model are
partitioned at runtime, so it is advisable to keep the parameter names aligned
in the actor model implementation. Otherwise, you may need an additional
name mapping and even weight transformation. The weight loader implementation
is in `megatron_weight_loaders.py <https://github.com/volcengine/verl/blob/main/verl/third_party/vllm/vllm_v_0_6_3/megatron_weight_loaders.py>`_.
Ray API Design Tutorial
=======================================
We provide a tutorial for our Ray API design, including:
- Ray basic concepts
- Resource Pool and RayWorkerGroup
- Data Dispatch, Execution and Collection
- Initialize the RayWorkerGroup and execute the distributed computation in the given Resource Pool
See details in `tutorial.ipynb <https://github.com/volcengine/verl/blob/main/examples/ray/tutorial.ipynb>`_.
RL(HF) algorithms with LoRA Support
===========================================
We support LoRA (Low-Rank Adaptation) for reinforcement learning algorithms such as PPO, GRPO, and others.
LoRA is a parameter-efficient fine-tuning technique that injects trainable low-rank matrices into pre-trained weights (typically linear layers). This reduces memory footprint and compute cost, making it possible to fine-tune large models with limited hardware.
The benefits this brings include:
- reinforcement learning with very large models (e.g. 70B+) on modest hardware (e.g. 8x80GB GPUs),
- larger batch sizes thanks to the reduced memory usage,
- simpler model transfer and deployment, as only the LoRA adapters need to be saved,
- compatibility with techniques like `SLoRA <https://arxiv.org/abs/2311.03285>`_ or `CCoE <https://arxiv.org/abs/2407.11686>`_ to serve multiple LoRA adapters efficiently.
This guide explains how to enable LoRA in RL training and configure related parameters.
Usage Guide
------------------------
1. LoRA is available in `verl.trainer.ppo.ray_trainer.RayPPOTrainer`. Examples are provided via the `verl.trainer.main_ppo` entry point.
2. Currently, LoRA is supported via Hugging Face peft, and only with the fsdp/fsdp2 strategy and the vllm rollout backend (sglang support coming soon).
- `strategy=fsdp` or `strategy=fsdp2`
- `rollout.name=vllm`
3. Required configurations for LoRA:
- `actor_rollout_ref.model.lora_rank`: int, set to a reasonable value greater than 0 (e.g., 8, 16, 32, 64)
- `actor_rollout_ref.model.lora_alpha`: float, the alpha term in LoRA
- `actor_rollout_ref.rollout.load_format="safetensors"`: required. This enables vLLM to load the base model.
- `actor_rollout_ref.model.target_modules`: the target modules for LoRA. Typically set to "all-linear".
4. Recommended options:
- `actor_rollout_ref.model.use_shm=True`: preload the model into `/dev/shm` to improve model loading speed.
- `actor_rollout_ref.rollout.layered_summon=True`: this enables the actor model to gather the FSDP shards layer by layer when synchronizing the LoRA adapter to vLLM, thereby reducing peak GPU memory. Recommended if the model is very large (70B+) or GPU memory is limited (< 48GB).
Best Practices and Notes
-------------------------
1. **Learning rate**: it is recommended to increase the learning rate by an order of magnitude.
2. **LoRA Rank**:
- Too small a rank can hurt convergence.
- LoRA rank recommendation from @thelongestusernameofall:
- A very small lora_rank can lead to slower convergence or worse training performance. It is recommended to set lora_rank to be >= 32. Tests have shown that for a 0.5B model, with lora_rank=32, the training convergence speed and final performance are almost identical to non-LoRA training.
- For a 32B model, with lora_rank=128, the training convergence speed and final performance are also almost identical to non-LoRA training.
- More comprehensive reference results are coming soon.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/f2b80b8b26829124dd393b7a795a0640eff11644/docs/lora.jpg?raw=true
3. Reference configuration for RL training with the Qwen2.5-72B model using 8 x 80GB GPUs (increase lora_rank if needed):
.. code-block::
data.train_batch_size=64 \
actor_rollout_ref.model.use_shm=True \
actor_rollout_ref.model.lora_rank=32 \
actor_rollout_ref.model.lora_alpha=32 \
actor_rollout_ref.model.target_modules=all-linear \
actor_rollout_ref.actor.optim.lr=3e-5 \
actor_rollout_ref.actor.fsdp_config.fsdp_size=8 \
actor_rollout_ref.actor.fsdp_config.param_offload=True \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=True \
actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.4 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.rollout.max_num_seqs=64 \
actor_rollout_ref.rollout.max_model_len=1536 \
actor_rollout_ref.rollout.max_num_batched_tokens=1536 \
actor_rollout_ref.rollout.load_format=safetensors \
actor_rollout_ref.rollout.layered_summon=True \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
actor_rollout_ref.actor.ulysses_sequence_parallel_size=1 \
Example Script
-------------------
For an end-to-end example, refer to the script below:
examples/grpo_trainer/run_qwen2_5-3b_gsm8k_grpo_lora.sh
RoPE Scaling override
=======================================
Some models such as `Qwen/Qwen2.5-7B-Instruct <https://huggingface.co/Qwen/Qwen2.5-7B-Instruct#processing-long-texts>`_ support RoPE Scaling but don't have it defined in their config.json file.
For example, this model supports this configuration:
.. code:: python
{
...,
"rope_scaling": {
"factor": 4.0,
"original_max_position_embeddings": 32768,
"type": "yarn"
}
}
In order to support a longer context for such models, you must override the model configs when starting the trainer.
PPO example:
.. code:: bash
+actor_rollout_ref.model.override_config.rope_scaling.type=yarn \
+actor_rollout_ref.model.override_config.rope_scaling.factor=4.0 \
+actor_rollout_ref.model.override_config.rope_scaling.original_max_position_embeddings=32768 \
And for the critic model:
.. code:: bash
+critic.model.override_config.rope_scaling.type=yarn \
+critic.model.override_config.rope_scaling.factor=4.0 \
+critic.model.override_config.rope_scaling.original_max_position_embeddings=32768 \
# Algorithm Baselines
## Math related datasets
Assuming the GSM8K/MATH datasets are preprocessed via:
```bash
python3 examples/data_preprocess/*.py
```
Refer to the table below to reproduce RL training from different pre-trained checkpoints. Unless specified otherwise, the scores below are on the GSM8K dataset. More comprehensive benchmark results are available in the recipe folder.
| Hardware | Model | Method | Test score | Details |
|-------------|----------------------------------|-------------------|--------------|---------|
| NVIDIA GPU | google/gemma-2-2b-it | hf checkpoint | 23.9 | [Huggingface](https://huggingface.co/google/gemma-2-2b-it#benchmark-results) |
| NVIDIA GPU | google/gemma-2-2b-it | SFT | 52.06 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-sft-0.411.log) |
| NVIDIA GPU | google/gemma-2-2b-it | SFT + PPO | 64.02 | [command and logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-ppo-bsz512_4-prompt1024-resp-512-0.640.log), [wandb](https://api.wandb.ai/links/verl-team/h7ux8602) |
| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | hf checkpoint | 36.4 | [Qwen blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [command and log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
| NVIDIA GPU | Qwen/Qwen2.5-0.5B-Instruct | PRIME | 58.7 | [script](https://github.com/volcengine/verl/blob/main/recipe/prime/run_prime_qwen.sh), [wandb](https://api.wandb.ai/links/zefan-wang-thu-tsinghua-university/rxd1btvb) |
| NVIDIA GPU | deepseek-ai/deepseek-llm-7b-chat | PPO (Megatron) | 69.5 [1] | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/deepseek-llm-7b-chat-megatron-bsz256_4-prompt512-resp512-0.695.log), [wandb](https://wandb.ai/verl-team/verl_megatron_gsm8k_examples/runs/10fetyr3) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO | 89 | [script](https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (FSDP2) | 89.8 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log) |
| NVIDIA GPU | Qwen/Qwen2-7B-Instruct | GRPO (Megatron) | 89.6 | [log](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b_math_megatron.log) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | ReMax | 97 | [script](https://github.com/eric-haibin-lin/verl/blob/main/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh), [wandb](https://wandb.ai/liziniu1997/verl_remax_example_gsm8k/runs/vxl10pln) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPPO | 65.6 (MATH) | [SPPO script](https://github.com/volcengine/verl/tree/main/recipe/sppo/README.md) |
| NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | Instruct model | 83.7 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
| NVIDIA GPU | Mixtral-8x22B-Instruct-v0.1 | RLOO (Megatron) | 92.3 | [wandb](https://api.wandb.ai/links/ppo_dev/sbuiuf2d) |
| NVIDIA GPU | Qwen/Qwen2.5-7B-Instruct | SPIN | 92 | [script](https://github.com/volcengine/verl/tree/main/recipe/spin/README.md) |
| NVIDIA GPU | Qwen/Qwen2.5-VL-7B-Instruct | GRPO (Megatron) | 65.4 (GEO3k) | [script](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh), [wandb](https://api.wandb.ai/links/megatron-core-moe-dev/1yngvkek) |
| AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | PPO | 70.5 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/ppo_run_deepseek7b_llm.log) |
| AMD MI300 | deepseek-ai/deepseek-llm-7b-chat | GRPO | 71.4 [1] | [log](https://github.com/yushengsu-thu/verl_training_log/blob/main/gsm8k/grpo_run_deepseek7b_llm.log) |
## Coding related datasets
Unless specified otherwise, the results below are on the LeetCode dataset.
| Hardware | Model | Method | Test score | Details |
|-------------|----------------------------------|-------------------|--------------|---------|
| NVIDIA GPU  | PRIME-RL/Eurus-2-7B-SFT          | PRIME             | 36.1         | [script](https://github.com/volcengine/verl/blob/main/recipe/prime/run_prime_qwen_code.sh), [swanlab](https://swanlab.cn/@wangzefan/prime_example/runs/7f541qhspgmy8nmhdlx35/chart) |
### Notes
[1] During evaluation, we have only extracted answers following the format `"####"`. A more flexible answer extraction, longer response length, and better prompt engineering may lead to a higher score.
[2] The default value of `actor_rollout_ref.actor.entropy_coeff` is set to `0.0` since verl 0.3.x on 2025-05-30, which is different from previous versions.
# Recipe: Decoupled Clip and Dynamic Sampling Policy Optimization (DAPO)
> Open-Source Algorithm Implementation & Experiment Running: [Yuxuan Tong](https://tongyx361.github.io/), [Guangming Sheng](https://hk.linkedin.com/in/guangming-sheng-b50640211)
🏠 [Homepage](https://dapo-sia.github.io/) | 📝 [Paper](https://dapo-sia.github.io/static/pdf/dapo_paper.pdf) | 🤗 [Datasets&Models@HF](https://huggingface.co/collections/BytedTsinghua-SIA/dapo-67d7f1517ee33c8aed059da0) | 🐱 [Code@GitHub](https://github.com/volcengine/verl/tree/gm-tyx/puffin/main/recipe/dapo) | 🐱 [Repo@GitHub](https://github.com/BytedTsinghua-SIA/DAPO)
> We propose the **D**ecoupled Clip and Dynamic s**A**mpling **P**olicy **O**ptimization (DAPO) algorithm. By making our work publicly available, we provide the broader research community and society with practical access to scalable reinforcement learning, enabling all to benefit from these advancements. DAPO training on the Qwen2.5-32B base model outperforms the previous state-of-the-art DeepSeek-R1-Zero-Qwen-32B on AIME 2024, achieving **50%** accuracy with **50%** fewer training steps.
>
> ![dapo-main-result](https://dapo-sia.github.io/static/images/score.png)
## Quickstart
1. Prepare the datasets **on the Ray cluster**:
```bash
bash prepare_dapo_data.sh # This downloads the datasets to ${HOME}/verl/data by default
```
2. Submit the job to the Ray cluster **from any machine**:
```bash
cd verl # Repo root
export RAY_ADDRESS="http://${RAY_IP:-localhost}:8265" # The Ray cluster address to connect to
export WORKING_DIR="${PWD}" # The local directory to package to the Ray cluster
# Set the runtime environment like env vars and pip packages for the Ray cluster in yaml
export RUNTIME_ENV="./verl/trainer/runtime_env.yaml"
bash recipe/dapo/run_dapo_qwen2.5_32b.sh
```
## Reproduction Runs
| Setup | AIME 2024 Acc. | Training Script | Training Record |
| -------------------------------------------- | -------------- | ---------------------------------------------------------------- | ----------------------------------------------------------------------------------------- |
| DAPO w/o Token-level Loss & Dynamic Sampling | 44% | [run_dapo_early_qwen2.5_32b.sh](./run_dapo_early_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
| DAPO w/o Dynamic Sampling | 50% | [run_dapo_wo_ds_qwen2.5_32b.sh](./run_dapo_wo_ds_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
| DAPO | 52% | [run_dapo_qwen2.5_32b.sh](./run_dapo_qwen2.5_32b.sh) | [W&B](https://wandb.ai/verl-org/DAPO%20Reproduction%20on%20verl/workspace?nw=wmb4qxfht0n) |
## Configuration
> [!NOTE]
> Most experiments in the paper, including the best-performing one, are run without Overlong Filtering because it largely overlaps with Overlong Reward Shaping in terms of properly learning from the longest outputs, so we don't implement it here.
### Separated Clip Epsilons (-> Clip-Higher)
An example configuration:
```yaml
actor_rollout_ref:
actor:
clip_ratio_low: 0.2
clip_ratio_high: 0.28
```
`clip_ratio_low` and `clip_ratio_high` specify the $\varepsilon_{\text {low }}$ and $\varepsilon_{\text {high }}$ in the DAPO objective.
Core relevant code:
```python
pg_losses1 = -advantages * ratio
pg_losses2 = -advantages * torch.clamp(ratio, 1 - cliprange_low, 1 + cliprange_high)
pg_losses = torch.maximum(pg_losses1, pg_losses2)
```
### Dynamic Sampling (with Group Filtering)
An example configuration:
```yaml
data:
gen_batch_size: 1536
train_batch_size: 512
algorithm:
filter_groups:
enable: True
metric: acc # score / seq_reward / seq_final_reward / ...
max_num_gen_batches: 10 # Non-positive values mean no upper limit
```
Setting `filter_groups.enable` to `True` will filter out groups whose outputs' `metric` are all the same, e.g., for `acc`, groups whose outputs' accuracies are all 1 or 0.
The trainer will repeat sampling with `gen_batch_size` until there are enough qualified groups to fill `train_batch_size`, or until the upper limit specified by `max_num_gen_batches` is reached.
Core relevant code:
```python
prompt_bsz = self.config.data.train_batch_size
if num_prompt_in_batch < prompt_bsz:
print(f'{num_prompt_in_batch=} < {prompt_bsz=}')
num_gen_batches += 1
max_num_gen_batches = self.config.algorithm.filter_groups.max_num_gen_batches
if max_num_gen_batches <= 0 or num_gen_batches < max_num_gen_batches:
print(f'{num_gen_batches=} < {max_num_gen_batches=}. Keep generating...')
continue
else:
raise ValueError(
f'{num_gen_batches=} >= {max_num_gen_batches=}. Generated too many. Please check your data.'
)
else:
# Align the batch
traj_bsz = self.config.data.train_batch_size * self.config.actor_rollout_ref.rollout.n
batch = batch[:traj_bsz]
```
### Flexible Loss Aggregation Mode (-> Token-level Loss)
An example configuration:
```yaml
actor_rollout_ref:
actor:
loss_agg_mode: "token-mean" # / "seq-mean-token-sum" / "seq-mean-token-mean"
# NOTE: "token-mean" is the default behavior
```
Setting `loss_agg_mode` to `token-mean` will average the (policy gradient) loss over all the tokens in all the sequences in a mini-batch.
Core relevant code:
```python
if loss_agg_mode == "token-mean":
loss = verl_F.masked_mean(loss_mat, loss_mask)
elif loss_agg_mode == "seq-mean-token-sum":
seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) # token-sum
loss = torch.mean(seq_losses) # seq-mean
elif loss_agg_mode == "seq-mean-token-mean":
seq_losses = torch.sum(loss_mat * loss_mask, dim=-1) / torch.sum(loss_mask, dim=-1) # token-mean
loss = torch.mean(seq_losses) # seq-mean
else:
raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}")
```
### Overlong Reward Shaping
An example configuration:
```yaml
data:
max_response_length: 20480 # 16384 + 4096
reward_model:
overlong_buffer:
enable: True
len: 4096
penalty_factor: 1.0
```
Setting `overlong_buffer.enable` to `True` will penalize the outputs whose lengths are overlong but still within the hard context limit.
Specifically, the penalty increases linearly from `0` to `overlong_buffer.penalty_factor` when the length of the output exceeds the `max_response_length` by `0` to `overlong_buffer.len` tokens.
Core relevant code:
```python
if self.overlong_buffer_cfg.enable:
overlong_buffer_len = self.overlong_buffer_cfg.len
expected_len = self.max_resp_len - overlong_buffer_len
exceed_len = valid_response_length - expected_len
overlong_penalty_factor = self.overlong_buffer_cfg.penalty_factor
overlong_reward = min(-exceed_len / overlong_buffer_len * overlong_penalty_factor, 0)
reward += overlong_reward
```
# Group Relative Policy Optimization (GRPO)
In reinforcement learning, classic algorithms like PPO rely on a "critic" model to estimate the value of actions, guiding the learning process. However, training this critic model can be resource-intensive.
GRPO simplifies this process by eliminating the need for a separate critic model. Instead, it operates as follows:
- Group Sampling: For a given problem, the model generates multiple possible solutions, forming a "group" of outputs.
- Reward Assignment: Each solution is evaluated and assigned a reward based on its correctness or quality.
- Baseline Calculation: The average reward of the group serves as a baseline.
- Policy Update: The model updates its parameters by comparing each solution's reward to the group baseline, reinforcing better-than-average solutions and discouraging worse-than-average ones.
This approach reduces computational overhead by avoiding the training of a separate value estimation model, making the learning process more efficient. For more details, refer to the original paper [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://arxiv.org/pdf/2402.03300)
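To make the baseline and update steps concrete, here is a minimal sketch of group-relative advantage computation. It is illustrative only, not verl's exact implementation; verl computes this inside its advantage estimator when `algorithm.adv_estimator=grpo`.

```python
import torch

def grpo_advantages(rewards: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """Group-relative advantages for one prompt: reward minus the group-mean
    baseline, normalized by the group standard deviation."""
    baseline = rewards.mean()
    return (rewards - baseline) / (rewards.std() + eps)

# Example: 4 sampled responses for the same prompt, scored 0/1 for correctness.
print(grpo_advantages(torch.tensor([1.0, 0.0, 0.0, 1.0])))
```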
## Key Components
- No Value Function (Critic-less): unlike PPO, GRPO does not train a separate value network (critic)
- Group Sampling (Grouped Rollouts): instead of evaluating one rollout per input, GRPO generates multiple completions (responses) from the current policy for each prompt. This set of completions is referred to as a group.
- Relative Rewards: within each group, completions are scored (e.g., based on correctness), and rewards are normalized relative to the group.
## Configuration
Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
Despite that many configurations start with the `ppo_` prefix, they work across different RL algorithms in verl, as the GRPO training loop is similar to that of PPO (without critic).
![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d)
- `actor_rollout_ref.rollout.n`: For each prompt, sample n times. Defaults to 1. For GRPO, please set it to a value larger than 1 for group sampling.
- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`.
- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers.
- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for GRPO updates on one set of sampled trajectories for actor
- `actor_rollout_ref.actor.clip_ratio`: The GRPO clip range. Default to 0.2
- `algorithm.adv_estimator`: Default is gae. Please set it to grpo instead
- `actor_rollout_ref.actor.loss_agg_mode`: Default is "token-mean". Options include "token-mean", "seq-mean-token-sum", "seq-mean-token-mean". The original GRPO paper takes the sample-level loss (seq-mean-token-mean), which may be unstable in long-CoT scenarios. All GRPO example scripts provided in verl use the default "token-mean" loss aggregation instead.
Instead of adding KL penalty in the reward, GRPO regularizes by directly adding the KL divergence between the trained policy and the reference policy to the loss:
- `actor_rollout_ref.actor.use_kl_loss`: To use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False. Please set it to True for GRPO.
- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
- `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. How to calculate the kl divergence between actor and reference policy. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
## Advanced Extensions
### DrGRPO
[Understanding R1-Zero-Like Training: A Critical Perspective](https://arxiv.org/pdf/2503.20783) claims there's optimization bias in GRPO, which leads to artificially longer responses, especially for incorrect outputs. This inefficiency stems from the way GRPO calculates advantages using group-based reward normalization. Instead, DrGRPO aggregates token-level losses by normalizing with a global constant to eliminate length bias.
Configure the following to enable DrGRPO, with all other parameters the same as GRPO's:
- `actor_rollout_ref.actor.loss_agg_mode`: "seq-mean-token-sum-norm", which turns off seq-dim averaging (a sketch of this aggregation follows this list)
- `actor_rollout_ref.actor.use_kl_loss`: Please set it to False for DrGRPO
- `algorithm.norm_adv_by_std_in_grpo`: False, which turns off standard deviation norm
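For intuition, here is a minimal sketch of the aggregation referenced in the list above: token losses are summed per sequence and then divided by a global constant rather than by each sequence's own length. The constant name `MAX_RESP_LEN` and the exact normalization are assumptions for this example; see `agg_loss` in `verl.trainer.ppo.core_algos` for the actual `seq-mean-token-sum-norm` implementation.

```python
import torch

MAX_RESP_LEN = 1024  # assumed global constant, e.g. the generation budget

def drgrpo_aggregate(loss_mat: torch.Tensor, loss_mask: torch.Tensor) -> torch.Tensor:
    """Sum token losses per sequence, then normalize by a global constant
    instead of each sequence's own length, removing the length bias."""
    seq_losses = torch.sum(loss_mat * loss_mask, dim=-1)  # token-sum per sequence
    return torch.sum(seq_losses) / (loss_mask.shape[0] * MAX_RESP_LEN)

# Example with a batch of 2 sequences of up to 4 tokens (made-up values).
loss_mat = torch.rand(2, 4)
loss_mask = torch.tensor([[1, 1, 1, 0], [1, 1, 0, 0]], dtype=torch.float32)
print(drgrpo_aggregate(loss_mat, loss_mask))
```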
## Reference Example
Qwen2.5 GRPO training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/qwen2-7b-fsdp2.log)
```bash
bash examples/grpo_trainer/run_qwen3-8b.sh
```
For more reference performance, please see https://verl.readthedocs.io/en/latest/algo/baseline.html
# On-Policy RL with Optimal Reward Baseline (OPO)
Loose on-policy constraints and suboptimal baselines in reinforcement learning often lead to training instability such as large policy shifts and entropy collapse. OPO addresses these challenges by using exact on-policy training with the theoretically optimal reward baseline for advantage estimation. It achieves lower policy shifts and higher output entropy, encouraging more diverse and less repetitive responses.
OPO uses group sampling to generate multiple outputs for each input, like GRPO. Unlike group-based algorithms which typically use the mean reward of a group as the baseline, OPO employs a theoretically optimal baseline: the length-weighted reward of the group. It also omits the standard deviation normalization. By adopting these two key components, OPO enables the training of a single policy model with the objective of maximizing only the expected reward. For more details, refer to the original paper [On-Policy RL with Optimal Reward Baseline](https://arxiv.org/pdf/2505.23585).
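For intuition, here is a minimal sketch of the length-weighted baseline described above. It is illustrative only; the actual computation lives in verl's advantage estimator for `adv_estimator: opo`.

```python
import torch

def opo_advantages(rewards: torch.Tensor, lengths: torch.Tensor) -> torch.Tensor:
    """Length-weighted baseline for one group of sampled responses.

    rewards: (n,) scalar reward per response; lengths: (n,) response lengths in tokens.
    """
    baseline = (lengths * rewards).sum() / lengths.sum()  # length-weighted mean reward
    return rewards - baseline                             # no std normalization in OPO

# Example group: longer responses pull the baseline toward their rewards.
print(opo_advantages(torch.tensor([1.0, 0.0, 1.0]), torch.tensor([200.0, 800.0, 400.0])))
```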
## Key Components
- Exact On-Policy Training: always generates responses from the current policy, without using any pre-generated data or off-policy data.
- Optimal Reward Baseline: uses a length-weighted reward of the group as the baseline for normalizing the rewards.
## Configuration
To configure OPO within the framework, use the following YAML settings. These parameters are crucial for enabling exact on-policy training and activating the optimal reward baseline.
```yaml
algorithm:
adv_estimator: opo # Use OPO for optimal reward baseline
data:
train_batch_size: 1024
actor_rollout_ref:
actor:
ppo_mini_batch_size: 1024 # ppo_mini_batch_size should equal to train_batch_size to enable exact on-policy training
entropy_coeff: 0 # disable entropy regularization
use_kl_loss: False # disable kl regularization
kl_loss_coef: 0
```
## Advanced Extensions
OPO can also be extended to other algorithms like RLOO and Reinforce++: simply adjust their configurations to enable exact on-policy training and incorporate the optimal length-weighted reward baseline, with minimal modifications to their advantage estimation functions.
# Proximal Policy Optimization (PPO)
Proximal Policy Optimization (PPO) is a family of policy gradient methods for reinforcement learning, proposed by OpenAI in 2017. PPO strikes a balance between simplicity, stability, and performance, making it one of the most widely used algorithms in modern RL applications, including large-scale language model fine-tuning.
Traditional policy gradient methods like REINFORCE or Vanilla Policy Gradient suffer from:
- High variance and sample inefficiency.
- Instability due to large policy updates.
PPO addresses this problem using a clipped surrogate objective that avoids overly large updates without requiring second-order derivatives.
For more technical details regarding PPO, we suggest reading the introduction in the [OpenAI spinning up tutorial](https://spinningup.openai.com/en/latest/algorithms/ppo.html), and the paper [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347).
## Key Components
- Actor-Critic Architecture: PPO requires both an actor model (policy) and a critic model (value function). This differs from other algorithms like GRPO and RLOO that don't require a critic model.
- Generalized Advantage Estimation (GAE): PPO uses GAE for computing advantage values, which helps reduce variance in policy gradient estimates while maintaining low bias.
- Clipped Surrogate Objective: The core of PPO is the clipped surrogate objective function that limits the size of policy updates (a minimal sketch follows this list).
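Here is a minimal sketch of the clipped surrogate loss referenced in the list above. It is illustrative only; verl's `compute_policy_loss` in `verl.trainer.ppo.core_algos` additionally handles response masking and the loss aggregation modes.

```python
import torch

def ppo_clip_loss(log_prob: torch.Tensor, old_log_prob: torch.Tensor,
                  advantages: torch.Tensor, clip_ratio: float = 0.2) -> torch.Tensor:
    """Standard clipped surrogate loss over per-token quantities."""
    ratio = torch.exp(log_prob - old_log_prob)            # pi_theta / pi_theta_old
    unclipped = -advantages * ratio
    clipped = -advantages * torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    return torch.mean(torch.maximum(unclipped, clipped))  # pessimistic (clipped) objective

# Example with made-up per-token values.
lp = torch.tensor([-1.0, -0.5, -2.0])
old_lp = torch.tensor([-1.2, -0.4, -1.8])
adv = torch.tensor([0.5, -0.3, 1.0])
print(ppo_clip_loss(lp, old_lp, adv))
```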
## Configuration
Note that all configs containing `micro_batch_size` are used to configure the maximum sample or token count per forward or backward pass to avoid GPU OOMs, whose value should not change algorithmic/convergence behavior.
Most critic configs are similar to those of actors. Note that the critic model is omitted from the figure below.
![image](https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d)
- `data.train_batch_size`: The global batch size of prompts used to generate a set of sampled trajectories/rollouts. The number of responses/trajectories is `data.train_batch_size * actor_rollout_ref.rollout.n`.
- `actor_rollout_ref.actor.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO actor updates. The ppo_mini_batch_size is a global size across all workers
- `critic.ppo_mini_batch_size`: The set of sampled trajectories is split into multiple mini-batches with batch_size=ppo_mini_batch_size for PPO critic updates. The ppo_mini_batch_size is a global size across all workers.
- `actor_rollout_ref.actor.clip_ratio`: The PPO clip range. Default to 0.2
- `actor_rollout_ref.actor.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for actor
- `critic.ppo_epochs`: Number of epochs for PPO updates on one set of sampled trajectories for the critic.
- `algorithm.gamma`: The discount factor.
- `algorithm.lam`: The lambda term that trades off between bias and variance in the GAE estimator
- `algorithm.adv_estimator`: Support gae, grpo, reinforce_plus_plus, reinforce_plus_plus_baseline, rloo
## Advanced Extensions
### KL Divergence Control
Options to prevent the policy from diverging too far from a reference policy. Two mechanisms are available: KL reward penalty and KL loss. For more technical details, see [Training language models to follow instructions with human feedback](https://arxiv.org/abs/2203.02155)
Options to use KL loss for KL divergence control:
- `actor_rollout_ref.actor.use_kl_loss`: to use kl loss in the actor. When used, we are not applying KL in the reward function. Default is False
- `actor_rollout_ref.actor.kl_loss_coef`: The coefficient of kl loss. Default is 0.001.
- `actor_rollout_ref.actor.kl_loss_type`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. How to calculate the kl divergence between actor and reference policy. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html
Options to use KL penalty in the reward:
- `algorithm.use_kl_in_reward`: Whether to enable in-reward kl penalty. Default is False.
- `algorithm.kl_penalty`: Support kl(k1), abs, mse(k2), low_var_kl(k3) and full. This defines the way to calculate the kl divergence between actor and reference policy. For specific options, refer to `kl_penalty` in core_algos.py. See this blog post for detailed analysis: http://joschu.net/blog/kl-approx.html (a sketch of these estimators follows this list).
- `algorithm.kl_ctrl.kl_coef`: The (initial) coefficient of in-reward kl_penalty. Default is 0.001.
- `algorithm.kl_ctrl.type`: 'fixed' for FixedKLController and 'adaptive' for AdaptiveKLController.
- `algorithm.kl_ctrl.horizon`: See source code of AdaptiveKLController for details.
- `algorithm.kl_ctrl.target_kl`: See source code of AdaptiveKLController for details.
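Here is a minimal sketch of the k1/k2/k3 estimators referenced in the lists above, written for per-token log-probabilities with tokens sampled from the actor. It is illustrative only; `kl_penalty` in core_algos.py is the authoritative implementation and may differ in details such as clamping.

```python
import torch

def kl_estimate(logprob: torch.Tensor, ref_logprob: torch.Tensor, kind: str = "kl") -> torch.Tensor:
    """Per-token estimators of KL(actor || ref), with tokens sampled from the actor."""
    log_ratio = logprob - ref_logprob
    if kind == "kl":           # k1: unbiased but high variance
        return log_ratio
    if kind == "abs":
        return log_ratio.abs()
    if kind == "mse":          # k2: 0.5 * (log ratio)^2
        return 0.5 * log_ratio.square()
    if kind == "low_var_kl":   # k3: low variance and always >= 0
        return torch.exp(-log_ratio) - 1 + log_ratio
    raise ValueError(f"unknown estimator: {kind}")

# Example with made-up per-token log-probs.
lp = torch.tensor([-1.0, -0.7, -2.3])
ref = torch.tensor([-1.1, -0.6, -2.0])
print(kl_estimate(lp, ref, "low_var_kl"))
```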
### Dual-clip PPO
Dual-Clip PPO adds a lower bound to the clipped surrogate objective for tokens with negative advantages: when the probability ratio becomes very large, the per-token loss is capped at `clip_ratio_c` times the advantage magnitude instead of growing without bound (a sketch follows the configuration option below).
![image](https://github.com/user-attachments/assets/fc232181-d8b0-4307-8dd2-4dc0a4c1c139)
- `actor_rollout_ref.actor.clip_ratio_c`: lower bound of the value for Dual-clip PPO, defaults to 3.0
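A minimal sketch of the dual-clip rule (illustrative only; verl applies this inside its policy-loss computation when `clip_ratio_c` is set):

```python
import torch

def dual_clip_ppo_loss(ratio: torch.Tensor, advantages: torch.Tensor,
                       clip_ratio: float = 0.2, clip_ratio_c: float = 3.0) -> torch.Tensor:
    """Clipped surrogate with an extra bound applied only when advantages < 0."""
    losses1 = -advantages * ratio
    losses2 = -advantages * torch.clamp(ratio, 1.0 - clip_ratio, 1.0 + clip_ratio)
    clipped = torch.maximum(losses1, losses2)          # standard PPO clipping
    losses3 = -advantages * clip_ratio_c               # cap for negative advantages
    dual_clipped = torch.minimum(clipped, losses3)     # bound the loss when the ratio explodes
    return torch.mean(torch.where(advantages < 0, dual_clipped, clipped))

# Example: a very large ratio with a negative advantage is bounded by clip_ratio_c.
print(dual_clip_ppo_loss(torch.tensor([10.0, 1.0]), torch.tensor([-1.0, 0.5])))
```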
## Reference Example
Qwen2.5 training log and commands: [link](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log)
```bash
bash run_gemma.sh
trainer.n_gpus_per_node=1 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
trainer.logger=['console'] \
critic.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
data.train_batch_size=256 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size=2 \
critic.ppo_micro_batch_size=2
```
Reference performance with verl v0.2:
| Model | Method | Score | Link |
|-------------------------------|------------------|-------|------------------------------------------------------------------------------------------------|
| Qwen/Qwen2.5-0.5B-Instruct | pretrained model | 36.4 | [Qwen Blog](https://qwenlm.github.io/blog/qwen2.5-llm/) |
| Qwen/Qwen2.5-0.5B-Instruct | PPO | 56.7 | [PPO Command and Logs](https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log) |
# Recipe: Self-Play Preference Optimization (SPPO)
verl provides a community recipe implementation for the paper [Self-Play Preference Optimization for Language Model Alignment](https://arxiv.org/abs/2405.00675). SPPO can significantly enhance the performance of an LLM without strong external signals such as responses or preferences from GPT-4. It can outperform the model trained with iterative direct preference optimization (DPO), among other methods. SPPO is theoretically grounded, ensuring that the LLM can converge to the von Neumann winner (i.e., Nash equilibrium) under general, potentially intransitive preference, and empirically validated through extensive evaluations on multiple datasets.
Paper Authors: [Yue Wu](https://yuewu.us/)\*, [Zhiqing Sun](https://www.cs.cmu.edu/~zhiqings/)\*, [Huizhuo Yuan](https://scholar.google.com/citations?user=8foZzX4AAAAJ)\*, [Kaixuan Ji](https://scholar.google.com/citations?user=FOoKDukAAAAJ), [Yiming Yang](https://www.cs.cmu.edu/~yiming/), [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
verl Implementation Authors: [Yuhao Yang](https://github.com/yhyang201), [Chenyang Zhao](https://github.com/zhaochenyang20)
[[Webpage](https://uclaml.github.io/SPPO/)] [[Huggingface](https://huggingface.co/papers/2405.00675)] [[Paper](https://arxiv.org/abs/2405.00675)][[Original Implementation](https://github.com/uclaml/SPPO)]
## Reproduce the Experiment
We evaluate the performance of SPPO on the MATH dataset. Starting from an initial score of 46.6 with Qwen2.5-7B-Instruct, we achieve a score of 65.6 after 20 epochs of training, placing our model approximately in the top 20 on the [MATH leaderboard](https://paperswithcode.com/sota/math-word-problem-solving-on-math). It's important to note that verl's internal evaluation metrics may not perfectly align with the official evaluation methodology for Qwen2.5-7B-Instruct. Therefore, for consistency and fair comparison, we report only the results based on verl's evaluation framework.
```
git clone git@github.com:volcengine/verl.git
cd verl
python3 -m uv pip install -e ".[sglang]"
export WANDB_API_KEY=<YOUR_WANDB_API_KEY>
python3 examples/data_preprocess/math_dataset.py --local_dir ~/data/math
huggingface-cli download Qwen/Qwen2.5-7B-Instruct --local-dir $HOME/models/Qwen2.5-7B-Instruct
export CUDA_VISIBLE_DEVICES=0,1,2,3
bash recipe/sppo/run_qwen2.5-7b_rm.sh
```
Note that the installation may occasionally fail to install flash-attn. If this happens, you can install it manually by running:
```bash
python3 -m uv pip install wheel
python3 -m uv pip install packaging
python3 -m uv pip install flash-attn --no-build-isolation --no-deps
```
## Acknowledgement
We sincerely thank the contribution and guidance from:
- [Yue Wu](https://yuewu.us/)
- [Chendong Wang](https://cdwang96.github.io/)
- [Yifan Zhang](https://github.com/yifanzhang-pro)
- [Yongan Xiang](https://github.com/BearBiscuit05)
- [Junrong Lin](https://github.com/ocss884)
- [Yuxuan Tong](https://github.com/tongyx361)
- [Guangming Shen](https://github.com/PeterSH6)
- [Biao He](https://www.linkedin.com/in/biao-he/)
- [Qingquan Song](https://qingquansong.github.io/)
- [Quanquan Gu](https://web.cs.ucla.edu/~qgu/)
# Setup
## Dockerfile.rocm
```bash
# Build the docker in the repo dir:
# docker build -f docker/Dockerfile.rocm -t verl-rocm:03.04.2015 .
# docker images # you can find your built docker
#
FROM rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4
# Set working directory
# WORKDIR $PWD/app
# Set environment variables
ENV PYTORCH_ROCM_ARCH="gfx90a;gfx942"
# Install vllm
RUN pip uninstall -y vllm && \
rm -rf vllm && \
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git && \
cd vllm && \
MAX_JOBS=$(nproc) python3 setup.py install && \
cd .. && \
rm -rf vllm
# Copy the entire project directory
COPY . .
# Install dependencies
RUN pip install "tensordict<0.6" --no-deps && \
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]" \
torchdata \
transformers \
wandb \
orjson \
pybind11 && \
pip install -e . --no-deps
```
## Build the image:
```bash
docker build -t verl-rocm .
```
## Run the container
```bash
docker run --rm -it \
--device /dev/dri \
--device /dev/kfd \
-p 8265:8265 \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v $HOME/.ssh:/root/.ssh \
-v $HOME:$HOME \
--shm-size 128G \
-w $PWD \
verl-rocm \
/bin/bash
```
# Example
## PPO
```bash
YOUR_PROJECT_NAME=r1-verl-ppo-upstream
YOUR_RUN_NAME=r1-training_ppo-upstream
# export HYDRA_FULL_ERROR=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=256 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=256 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
critic.optim.lr=1e-5 \
critic.model.path=$MODEL_PATH \
critic.ppo_micro_batch_size_per_gpu=4 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.logger=['console'] \
trainer.project_name=$YOUR_PROJECT_NAME \
trainer.experiment_name=$YOUR_RUN_NAME \
+trainer.val_before_train=False \
trainer.default_hdfs_dir=null \
trainer.n_gpus_per_node=$GPUS_PER_NODE \
trainer.nnodes=1 \
trainer.save_freq=10 \
trainer.test_freq=10 \
trainer.total_epochs=15 #2>&1 | tee verl_demo.log
```
## GRPO
```bash
YOUR_PROJECT_NAME=r1-verl-grpo-upstream
YOUR_RUN_NAME=r1-training_grpo-upstream
# export HYDRA_FULL_ERROR=1
# export FSDP_VERBOSE=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
# MODEL_PATH=Qwen/Qwen2-7B-Instruct
python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=1024 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.model.enable_gradient_checkpointing=False \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.fsdp_config.param_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name=$YOUR_PROJECT_NAME \
trainer.experiment_name=$YOUR_RUN_NAME \
trainer.n_gpus_per_node=$GPUS_PER_NODE \
+trainer.val_before_train=False \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15
```
# Setup
## Docker:
Find the docker here: https://hub.docker.com/r/rocm/vllm/tags (rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4)
```bash
docker run --rm -it \
--device /dev/dri \
--device /dev/kfd \
--network host \
--ipc host \
--group-add video \
--cap-add SYS_PTRACE \
--security-opt seccomp=unconfined \
--privileged \
-v /home/yushensu:/home/yushensu \
-v $HOME/.ssh:/root/.ssh \
--shm-size 128G \
--name verl_vllm_upstream \
-w $PWD \
rocm/vllm:rocm6.2_mi300_ubuntu20.04_py3.9_vllm_0.6.4 \
/bin/bash
```
## Build ROCM vLLM:
```bash
pip uninstall -y vllm
git clone -b v0.6.3 https://github.com/vllm-project/vllm.git
cd vllm
export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
export MAX_JOBS=$(nproc)
# python3 setup.py develop # will not create src need to keep the repo
python3 setup.py install # will add src into py. You can delete the repo
cd ..
rm -rf vllm
```
## Install the required packages:
```bash
pip install "tensordict<0.6" --no-deps
pip install accelerate \
codetiming \
datasets \
dill \
hydra-core \
liger-kernel \
numpy \
pandas \
peft \
"pyarrow>=15.0.0" \
pylatexenc \
"ray[data,train,tune,serve]" \
torchdata \
transformers \
wandb \
orjson \
pybind11
pip install -e . --no-deps
```
# Example
## PPO
```bash
YOUR_PROJECT_NAME=r1-verl-ppo-upstream
YOUR_RUN_NAME=r1-training_ppo-upstream
# export HYDRA_FULL_ERROR=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=256 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=256 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=8 \
actor_rollout_ref.rollout.tensor_model_parallel_size=1 \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=4 \
critic.optim.lr=1e-5 \
critic.model.path=$MODEL_PATH \
critic.ppo_micro_batch_size_per_gpu=4 \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.logger=['console'] \
trainer.project_name=$YOUR_PROJECT_NAME \
trainer.experiment_name=$YOUR_RUN_NAME \
+trainer.val_before_train=False \
trainer.default_hdfs_dir=null \
trainer.n_gpus_per_node=$GPUS_PER_NODE \
trainer.nnodes=1 \
trainer.save_freq=10 \
trainer.test_freq=10 \
trainer.total_epochs=15 #2>&1 | tee verl_demo.log
```
## GRPO
```bash
YOUR_PROJECT_NAME=r1-verl-grpo-upstream
YOUR_RUN_NAME=r1-training_grpo-upstream
# export HYDRA_FULL_ERROR=1
# export FSDP_VERBOSE=1
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
GPUS_PER_NODE=8
MODEL_PATH=Qwen/Qwen2.5-0.5B-Instruct
# MODEL_PATH=Qwen/Qwen2-7B-Instruct
python3 examples/data_preprocess/gsm8k.py --local_dir data/gsm8k
python3 -c "import transformers; transformers.pipeline('text-generation', model='$MODEL_PATH')"
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=data/gsm8k/train.parquet \
data.val_files=data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.val_batch_size=1312 \
data.max_prompt_length=512 \
data.max_response_length=1024 \
actor_rollout_ref.model.path=$MODEL_PATH \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=24000 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.model.enable_gradient_checkpointing=False \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.fsdp_config.param_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name=$YOUR_PROJECT_NAME \
trainer.experiment_name=$YOUR_RUN_NAME \
trainer.n_gpus_per_node=$GPUS_PER_NODE \
+trainer.val_before_train=False \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=10 \
trainer.total_epochs=15
```
verl performance tuning for AMD (ROCm Kernel)
=====================================================
Author: `Yang Wang <https://github.com/YangWang92/>`_
Patch vLLM to Enable Sleep Mode for AMD GPUs
--------------------------------------------------------------
By default, verl requires vLLM to enable sleep mode, which allows vLLM to offload GPU memory to CPU memory after rollout. However, this feature is still under review by the vLLM community.
To enable vLLM's sleep mode, you can first build vLLM from the community-patched source code (from `this pull request <https://github.com/vllm-project/vllm/pull/12695>`_). Once the patch is merged into the vLLM main branch, you can directly install vLLM from the latest version.
1. Clone the vLLM repository and build it with the following commands:
.. code-block:: bash
git clone -b sleep_amd https://github.com/HollowMan6/vllm.git
cd vllm
sudo ln -sf /opt/rocm/lib/libamdhip64.so /usr/lib/libamdhip64.so
VLLM_TARGET_DEVICE=rocm ROCM_PATH=/opt/rocm/ VLLM_GPU_LANG=HIP SETUPTOOLS_SCM_PRETEND_VERSION=0.8.4.dev python3 setup.py develop
2. Additionally, make sure the ROCm version in your Docker image is at least ROCm 6.3.4; we recommend ROCm 6.4.0 for better performance (see `this comment <https://github.com/vllm-project/vllm/pull/12695#issuecomment-2637839574>`_).
After the upgrade, you can verify whether sleep mode is enabled by running the following test code (from `this comment <https://github.com/vllm-project/vllm/pull/12695#issuecomment-2637839574>`_).
.. code-block:: python
import torch
from vllm import LLM
llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct", enable_sleep_mode=True)
def run_inference(prompt):
outputs = llm.generate(prompt)
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
print("CUDA Memory Usage (after inference):")
torch.cuda.empty_cache()
print(f"{torch.cuda.memory_allocated()=}")
run_inference("San Francisco is")
llm.sleep()
print("CUDA Memory Usage (after sleep):")
torch.cuda.empty_cache()
print(f"{torch.cuda.memory_allocated()=}")
llm.wake_up()
print("CUDA Memory Usage (after wakeup):")
torch.cuda.empty_cache()
print(f"{torch.cuda.memory_allocated()=}")
run_inference("Paris is")
If sleep mode is enabled, you should see the memory usage reduce after sleep.
After applying the vLLM patch and completing the installation, you can enable sleep mode in verl to reduce memory overhead. This allows verl to offload unused GPU memory during rollout, significantly lowering the memory footprint during long-context training or multi-node reinforcement learning.
Enable CUDA Graph and Bypass ROCm-related issues
--------------------------------------------------------------
Due to potential issues with CUDA graph capture in ROCm, we’ve found that vLLM’s CUDA graph feature cannot be enabled on multiple nodes in verl on AMD platforms with vLLM V1 mode. This leads to significantly slower rollout performance.
Our investigation shows that ROCm may trigger an unexpected crash when attempting to capture large batches with CUDA graph. One workaround is to patch the LLM configuration (from `this commit <https://github.com/volcengine/verl/blob/v0.3.0.rc0/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py#L100-L115>`_).
.. code-block:: python
self.inference_engine = LLM(
model=model_path,
enable_sleep_mode=True,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend="external_launcher",
dtype=config.dtype,
enforce_eager=config.enforce_eager,
gpu_memory_utilization=config.gpu_memory_utilization,
disable_custom_all_reduce=True,
disable_mm_preprocessor_cache=True,
limit_mm_per_prompt=limit_mm_per_prompt,
skip_tokenizer_init=False,
max_model_len=max_model_len,
load_format=load_format,
disable_log_stats=config.disable_log_stats,
max_num_batched_tokens=max_num_batched_tokens,
enable_chunked_prefill=config.enable_chunked_prefill,
enable_prefix_caching=True,
trust_remote_code=trust_remote_code,
# enable compilation config to bypass oom on rocm
# change depends on your GPU memory size
compilation_config={"cudagraph_capture_sizes": [1, 2, 4, 8, 16, 32, 64]},
seed=config.get('seed', 0),
)
Then, you can enable CUDA graph by setting the following configuration options (see `this page <https://github.com/volcengine/verl/blob/v0.3.0.rc0/docs/README_vllm0.8.md>`_):
.. code-block:: bash
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
Data interface
=========================
DataProto is the interface for data exchange.
The :class:`verl.DataProto` class contains two key members:
- batch: a :class:`tensordict.TensorDict` object for the actual data
- meta_info: a :class:`Dict` with additional meta information
TensorDict
~~~~~~~~~~~~
:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem.
A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size.
.. code-block:: python
>>> import torch
>>> from tensordict import TensorDict
>>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,])
>>> tensordict["twos"] = 2 * torch.ones(2, 5, 6)
>>> zeros = tensordict["zeros"]
>>> tensordict
TensorDict(
fields={
ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([2]),
device=None,
is_shared=False)
One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well.
.. code-block:: python
>>> tensordict[..., :1]
TensorDict(
fields={
ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([1]),
device=None,
is_shared=False)
>>> tensordict = tensordict.to("cuda:0")
>>> tensordict = tensordict.reshape(6)
For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation.
.. _tensordict: https://pytorch.org/tensordict/overview.html
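For illustration, here is a minimal sketch of wrapping rollout tensors into a ``DataProto``. The dataclass-style constructor and the exact ``select`` signature below are assumptions for this example; the authoritative interface is documented in the Core APIs section that follows.

.. code-block:: python

   import torch
   from tensordict import TensorDict
   from verl import DataProto

   # Batch of 4 sequences of 16 tokens each (made-up data).
   batch = TensorDict(
       {
           "input_ids": torch.randint(0, 100, (4, 16)),
           "attention_mask": torch.ones(4, 16, dtype=torch.long),
       },
       batch_size=[4],
   )
   # Assumption: construct DataProto directly from its two documented members.
   data = DataProto(batch=batch, meta_info={"global_step": 0})

   # The documented methods (e.g. select, to) operate on the wrapped TensorDict.
   subset = data.select(batch_keys=["input_ids"])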
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.DataProto
:members: to, select, union, make_iterator, concat
Single Controller interface
============================
The Single Controller provides a unified interface for managing distributed workers
using Ray or other backends and executing functions across them.
It simplifies the process of dispatching tasks and collecting results, particularly
when dealing with data parallelism or model parallelism.
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.single_controller.Worker
:members: __init__, __new__, get_master_addr_port, get_cuda_visible_devices, world_size, rank
.. autoclass:: verl.single_controller.WorkerGroup
:members: __init__, world_size
.. autoclass:: verl.single_controller.ClassWithInitArgs
:members: __init__, __call__
.. autoclass:: verl.single_controller.ResourcePool
:members: __init__, world_size, local_world_size_list, local_rank_list
.. autoclass:: verl.single_controller.ray.RayWorkerGroup
:members: __init__
.. autofunction:: verl.single_controller.ray.create_colocated_worker_cls
Trainer Interface
================================
Trainers drive the training loop. Introducing new trainer classes for new training paradigms is encouraged.
.. autosummary::
:nosignatures:
verl.trainer.ppo.ray_trainer.RayPPOTrainer
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.trainer.ppo.ray_trainer.RayPPOTrainer
:members: __init__, init_workers, fit
.. automodule:: verl.utils.tokenizer
:members: hf_tokenizer
.. automodule:: verl.trainer.ppo.core_algos
:members: agg_loss, kl_penalty, compute_policy_loss, kl_penalty
.. automodule:: verl.trainer.ppo.reward
:members: load_reward_manager, compute_reward, compute_reward_async
.. autoclass:: verl.workers.reward_manager.NaiveRewardManager
.. autoclass:: verl.workers.reward_manager.DAPORewardManager
Utilities
============
This section documents the utility functions and classes in the verl library.
Python Functional Utilities
------------------------------
.. automodule:: verl.utils.py_functional
:members: append_to_dict
File System Utilities
------------------------
.. automodule:: verl.utils.fs
:members: copy_to_local
Tracking Utilities
---------------------
.. automodule:: verl.utils.tracking
:members: Tracking
Metrics Utilities
---------------------
.. automodule:: verl.utils.metric
:members: reduce_metrics
Checkpoint Management
------------------------
.. automodule:: verl.utils.checkpoint.checkpoint_manager
:members: find_latest_ckpt_path
.. automodule:: verl.utils.checkpoint.fsdp_checkpoint_manager
:members: FSDPCheckpointManager
Dataset Utilities
---------------------
.. automodule:: verl.utils.dataset.rl_dataset
:members: RLHFDataset, collate_fn
Torch Functional Utilities
-----------------------------
.. automodule:: verl.utils.torch_functional
:members: get_constant_schedule_with_warmup, masked_whiten, masked_mean, logprobs_from_logits
Sequence Length Balancing
----------------------------
.. automodule:: verl.utils.seqlen_balancing
:members: get_reverse_idx, rearrange_micro_batches
Ulysses Utilities
--------------------
.. automodule:: verl.utils.ulysses
:members: gather_outpus_and_unpad, ulysses_pad_and_slice_inputs
FSDP Utilities
------------------
.. automodule:: verl.utils.fsdp_utils
:members: get_fsdp_wrap_policy, get_init_weight_context_manager, init_fn, load_fsdp_model_to_gpu, load_fsdp_optimizer, offload_fsdp_model_to_cpu, offload_fsdp_optimizer,
Debug Utilities
-------------------
.. automodule:: verl.utils.debug
:members: log_gpu_memory_usage, GPUMemoryLogger
verl x Ascend
===================================
We have added support for Huawei Ascend devices to verl.
Hardware Support
-----------------------------------
Atlas 200T A2 Box16
Atlas 800T A2
Installation
-----------------------------------
Basic Environment Setup
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+-----------+-------------+
| software | version |
+-----------+-------------+
| Python | == 3.10 |
+-----------+-------------+
| CANN | == 8.1.RC1 |
+-----------+-------------+
| torch | == 2.5.1 |
+-----------+-------------+
| torch_npu | == 2.5.1.RC1|
+-----------+-------------+
vllm & vllm-ascend
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
To use vLLM properly within verl, vllm and vllm-ascend need to be compiled and installed with the following commands. Note that the installation method differs depending on the machine type.
.. code-block:: bash
# vllm
git clone -b v0.7.3 --depth 1 https://github.com/vllm-project/vllm.git
cd vllm
pip install -r requirements-build.txt
# for Atlas 200T A2 Box16
VLLM_TARGET_DEVICE=empty pip install -e . --extra-index https://download.pytorch.org/whl/cpu/
# for Atlas 800T A2
VLLM_TARGET_DEVICE=empty pip install -e .
.. code-block:: bash
# vllm-ascend
git clone -b v0.7.3 --depth 1 https://github.com/vllm-project/vllm-ascend.git
cd vllm-ascend
export COMPILE_CUSTOM_KERNELS=1
python setup.py install
Install verl
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
.. code-block:: bash
git clone https://github.com/volcengine/verl.git
cd verl
pip install -r requirements-npu.txt
pip install -e .
Notes on Other Third-Party Libraries
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+--------------+---------------+
| software | description |
+--------------+---------------+
| transformers | >= v4.52.0 |
+--------------+---------------+
| flash_attn | not supported |
+--------------+---------------+
| liger-kernel | not supported |
+--------------+---------------+
| tensordict | 0.8.3 (ARM) |
+--------------+---------------+
1. Enabling --flash_attention_2 through transformers is supported; transformers must be >= 4.52.0.
2. Enabling flash attention acceleration through flash_attn is not supported.
3. liger-kernel is not supported.
4. For ARM servers, tensordict 0.8.3 is required; you can install tensordict manually after the other dependencies are installed.
Quick Start
-----------------------------------
Before regular use, we recommend verifying your environment setup and installation by running GRPO training with Qwen2.5-0.5B.
1. Download the dataset and preprocess it into parquet format so that it contains the fields required for computing RL rewards.
.. code-block:: bash
python3 examples/data_preprocess/gsm8k.py --local_dir ~/data/gsm8k
2. Run the training.
.. code-block:: bash
set -x
export VLLM_ATTENTION_BACKEND=XFORMERS
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=128 \
data.max_prompt_length=512 \
data.max_response_length=128 \
data.filter_overlong_prompts=True \
data.truncation='error' \
actor_rollout_ref.model.path=Qwen/Qwen2.5-0.5B-Instruct \
actor_rollout_ref.actor.optim.lr=5e-7 \
actor_rollout_ref.model.use_remove_padding=False \
actor_rollout_ref.actor.entropy_coeff=0.001 \
actor_rollout_ref.actor.ppo_mini_batch_size=64 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=20 \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.001 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=40 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=40 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console'] \
trainer.project_name='verl_grpo_example_gsm8k' \
trainer.experiment_name='qwen2_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=5 \
trainer.total_epochs=1 \
trainer.device=npu $@
Support Status
-----------------------------------
+-----------+----------------------+-------------+-------------------+----------------------+
| algorithm | model | rewards mae | throughput ratio | hardware |
+-----------+----------------------+-------------+-------------------+----------------------+
| GRPO | Qwen2.5-7B-instruct | 0.38% | 0.588 | Atlas 200T A2 Box16 |
+-----------+----------------------+-------------+-------------------+----------------------+
| GRPO | Qwen2.5-32B-instruct | 0.30% | 0.685 | Atlas 200T A2 Box16 |
+-----------+----------------------+-------------+-------------------+----------------------+
| DAPO | Qwen2.5-7B-instruct | 3.83% | pending | Atlas 200T A2 Box16 |
+-----------+----------------------+-------------+-------------------+----------------------+
GRPO training for Qwen2.5 is currently supported. GRPO training for Qwen2.5-VL will be supported once the following vllm-ascend issues are fixed:
1. `issues#809 <https://github.com/vllm-project/vllm-ascend/issues/809>`_
2. `issues#825 <https://github.com/vllm-project/vllm-ascend/issues/825>`_
Accuracy comparison notes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
For SFT-style algorithms, we expect the mean absolute error of the loss between Huawei Ascend devices and the A100 to be <= 2% under the same configuration. The calculation is shown in the figure below. For more information, see the `accuracy calculation guide <https://www.hiascend.com/document/detail/zh/Pytorch/600/ptmoddevg/trainingmigrguide/LMaccuracy_0001.html>`_.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/loss_comparison.png?raw=true
:alt: loss_comparison
Empirically, for RL-style algorithms such as GRPO, we expect the mean absolute error of rewards between Huawei Ascend devices and the A100 to be <= 4% under the same configuration; the calculation follows the figure above.
Throughput comparison notes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
For both the Ascend NPU and the A100, we average the "perf/throughput" values of the first 4 steps in the logs; throughput ratio = NPU average / A100 average.
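For example, the calculation amounts to the following sketch (the numbers are placeholders, not measured values):

.. code-block:: python

    # Placeholder values for "perf/throughput" over the first 4 steps.
    npu_throughput = [152.0, 149.5, 151.2, 150.3]
    a100_throughput = [255.1, 254.0, 256.4, 253.5]

    npu_avg = sum(npu_throughput) / len(npu_throughput)
    a100_avg = sum(a100_throughput) / len(a100_throughput)
    print(f"throughput ratio = {npu_avg / a100_avg:.3f}")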
Roadmap
-----------------------------------
See the `roadmap <https://github.com/volcengine/verl/discussions/900>`_ for the support progress of more features.
Disclaimer
-----------------------------------
The Ascend support code provided in verl is for reference only; for commercial use, please reach out through official channels. Thank you.
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html
# -- Path setup --------------------------------------------------------------
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))
# -- Project information -----------------------------------------------------
project = "verl"
copyright = "2024 ByteDance Seed Foundation MLSys Team"
author = "Guangming Sheng, Chi Zhang, Yanghua Peng, Haibin Lin"
# -- General configuration ---------------------------------------------------
# The master toctree document.
master_doc = "index"
# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
"myst_parser",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.autosectionlabel",
"sphinx.ext.napoleon",
]
# Use Google style docstrings instead of NumPy docstrings.
napoleon_google_docstring = True
napoleon_numpy_docstring = False
# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
source_suffix = {
".rst": "restructuredtext",
".md": "markdown",
}
# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#
# This is also used if you do content translation via gettext catalogs.
# Usually you set "language" from the command line for these cases.
language = "en"
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
# -- Options for HTML output -------------------------------------------------
# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]
# Add the JavaScript file
html_js_files = [
"js/runllm-widget.js",
]
Data interface
=========================
DataProto is the interface for data exchange.
The :class:`verl.DataProto` class contains two key members:
- batch: a :class:`tensordict.TensorDict` object for the actual data
- meta_info: a :class:`Dict` with additional meta information
TensorDict
~~~~~~~~~~~~
:attr:`DataProto.batch` is built on top of :class:`tensordict`, a project in the PyTorch ecosystem.
A TensorDict is a dict-like container for tensors. To instantiate a TensorDict, you must specify key-value pairs as well as the batch size.
.. code-block:: python
>>> import torch
>>> from tensordict import TensorDict
>>> tensordict = TensorDict({"zeros": torch.zeros(2, 3, 4), "ones": torch.ones(2, 3, 5)}, batch_size=[2,])
>>> tensordict["twos"] = 2 * torch.ones(2, 5, 6)
>>> zeros = tensordict["zeros"]
>>> tensordict
TensorDict(
fields={
ones: Tensor(shape=torch.Size([2, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
twos: Tensor(shape=torch.Size([2, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
zeros: Tensor(shape=torch.Size([2, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([2]),
device=None,
is_shared=False)
One can also index a tensordict along its batch_size. The contents of the TensorDict can be manipulated collectively as well.
.. code-block:: python
>>> tensordict[..., :1]
TensorDict(
fields={
ones: Tensor(shape=torch.Size([1, 3, 5]), device=cpu, dtype=torch.float32, is_shared=False),
twos: Tensor(shape=torch.Size([1, 5, 6]), device=cpu, dtype=torch.float32, is_shared=False),
zeros: Tensor(shape=torch.Size([1, 3, 4]), device=cpu, dtype=torch.float32, is_shared=False)},
batch_size=torch.Size([1]),
device=None,
is_shared=False)
>>> tensordict = tensordict.to("cuda:0")
>>> tensordict = tensordict.reshape(6)
For more about :class:`tensordict.TensorDict` usage, see the official tensordict_ documentation.
.. _tensordict: https://pytorch.org/tensordict/overview.html
Core APIs
~~~~~~~~~~~~~~~~~
.. autoclass:: verl.DataProto
:members: to, select, union, make_iterator, concat
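A minimal usage sketch is shown below. The field names (``input_ids``, ``attention_mask``) and the direct constructor call are illustrative assumptions; consult the class docstring for the exact construction helpers.

.. code-block:: python

    import torch
    from tensordict import TensorDict
    from verl import DataProto

    # Build a toy batch of two samples (field names are only illustrative).
    batch = TensorDict(
        {
            "input_ids": torch.randint(0, 100, (2, 8)),
            "attention_mask": torch.ones(2, 8, dtype=torch.long),
        },
        batch_size=[2],
    )
    data = DataProto(batch=batch, meta_info={"temperature": 1.0})

    # Keep only the tensors a worker needs.
    inputs_only = data.select(batch_keys=["input_ids"])

    # Concatenate two DataProto objects along the batch dimension.
    merged = DataProto.concat([data, data])
    print(merged.batch.batch_size)  # torch.Size([4])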
GSM8K Example
=============
Introduction
------------
In this example, we train an LLM to tackle the GSM8k task.
Paper: https://arxiv.org/pdf/2110.14168
Dataset: https://huggingface.co/datasets/gsm8k
Note that the original paper mainly focuses on training a verifier (a
reward model) to solve math problems via Best-of-N sampling. In this
example, we train an RLHF agent using a rule-based reward model.
Dataset Introduction
--------------------
GSM8k is a math problem dataset. Each prompt is an elementary school
problem, and the LLM is asked to solve it.
The training set contains 7473 samples and the test set contains 1319
samples.
**An example**
Prompt
Katy makes coffee using teaspoons of sugar and cups of water in the
ratio of 7:13. If she used a total of 120 teaspoons of sugar and cups
of water, calculate the number of teaspoonfuls of sugar she used.
Solution
The total ratio representing the ingredients she used to make the
coffee is 7+13 = <<7+13=20>>20. Since the fraction representing the
number of teaspoons she used is 7/20, she used 7/20*120 =
<<7/20*120=42>>42 #### 42
Step 1: Prepare dataset
-----------------------
.. code:: bash
cd examples/data_preprocess
python3 gsm8k.py --local_dir ~/data/gsm8k
Step 2: Download Model
----------------------
There are three ways to prepare the model checkpoints for post-training:
- Download the required models from huggingface or modelscope
.. code:: bash
huggingface-cli download deepseek-ai/deepseek-math-7b-instruct --local-dir ~/models/deepseek-math-7b-instruct --local-dir-use-symlinks False
# or
modelscope download --model deepseek-ai/deepseek-math-7b-instruct --local_dir ~/models/deepseek-math-7b-instruct
- Use a model already stored in a local directory or an HDFS path.
- Also, you can directly use the model name in huggingface (e.g.,
deepseek-ai/deepseek-math-7b-instruct) in
``actor_rollout_ref.model.path`` and ``critic.model.path`` fields in
the run script. You can also download models from modelscope by setting environmental variable ``VERL_USE_MODELSCOPE=True``.
See examples/ppo_trainer/run_deepseek7b_llm_modelscope.sh for example.
Note that users should prepare checkpoints for the actor, critic and reward
model.
[Optional] Step 3: SFT your Model
---------------------------------
We provide an SFT Trainer using PyTorch FSDP in
`fsdp_sft_trainer.py <https://github.com/volcengine/verl/blob/main/verl/trainer/fsdp_sft_trainer.py>`_.
Users can customize their own SFT
script using our FSDP SFT Trainer.
We also provide various training scripts for SFT on GSM8K dataset in `gsm8k sft directory <https://github.com/volcengine/verl/blob/main/examples/sft/gsm8k/>`_.
.. code:: shell
set -x
torchrun -m verl.trainer.fsdp_sft_trainer \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.prompt_key=question \
data.response_key=answer \
data.micro_batch_size_per_gpu=8 \
model.partial_pretrain=deepseek-ai/deepseek-coder-6.7b-instruct \
trainer.default_hdfs_dir=hdfs://user/verl/experiments/gsm8k/deepseek-coder-6.7b-instruct/ \
trainer.project_name=gsm8k-sft \
trainer.experiment_name=gsm8k-sft-deepseek-coder-6.7b-instruct \
trainer.total_epochs=4 \
trainer.logger=['console','wandb']
If you use AMD GPUs (ROCm kernel), you need to add the following environment variables into the run script:
.. code-block:: bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
Step 4: Perform PPO training with your model on GSM8K Dataset
-------------------------------------------------------------
- Prepare your own run.sh script. Here's an example for GSM8k dataset
and deepseek-llm-7b-chat model.
- Users could replace the ``data.train_files``, ``data.val_files``,
``actor_rollout_ref.model.path`` and ``critic.model.path`` based on
their environment.
- See :doc:`config` for detailed explanation of each config field.
**Reward Model/Function**
We use a rule-based reward model. We force the model to produce a final
answer following four "#" characters, as shown in the solution. We extract the final
answer from both the solution and the model's output using regular
expression matching. We compare them and assign a reward of 1 for a correct
answer, 0.1 for an incorrect answer and 0 for no answer.
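As an illustration, a simplified scorer along these lines could look like the sketch below; the authoritative implementation lives in ``verl/utils/reward_score/gsm8k.py``, and the regex here is only a rough rendition of it.

.. code-block:: python

    import re

    def gsm8k_rule_based_score(solution: str, model_output: str) -> float:
        """Simplified sketch: compare the final answers found after '####'."""
        def extract(text):
            match = re.search(r"####\s*(-?[\d,\.]+)", text)
            return match.group(1).replace(",", "") if match else None

        ground_truth = extract(solution)
        answer = extract(model_output)
        if answer is None:
            return 0.0   # no answer produced
        if answer == ground_truth:
            return 1.0   # correct final answer
        return 0.1       # answered, but incorrect

    print(gsm8k_rule_based_score("... #### 42", "The answer is #### 42"))  # 1.0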
**Training Script**
The example training scripts for the FSDP and Megatron-LM backends are stored in the examples/ppo_trainer directory.
.. code:: bash
cd ../ppo_trainer
bash run_deepseek7b_llm.sh
The script of run_deepseek7b_llm.sh
.. code:: bash
set -x
python3 -m verl.trainer.main_ppo \
data.train_files=$HOME/data/gsm8k/train.parquet \
data.val_files=$HOME/data/gsm8k/test.parquet \
data.train_batch_size=1024 \
data.max_prompt_length=512 \
data.max_response_length=512 \
actor_rollout_ref.model.path=deepseek-ai/deepseek-llm-7b-chat \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.ppo_mini_batch_size=256 \
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=32 \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
critic.optim.lr=1e-5 \
critic.model.use_remove_padding=True \
critic.model.path=deepseek-ai/deepseek-llm-7b-chat \
critic.model.enable_gradient_checkpointing=True \
critic.ppo_micro_batch_size_per_gpu=32 \
critic.model.fsdp_config.param_offload=False \
critic.model.fsdp_config.optimizer_offload=False \
algorithm.kl_ctrl.kl_coef=0.001 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
trainer.project_name='verl_example_gsm8k' \
trainer.experiment_name='deepseek_llm_7b_function_rm' \
trainer.n_gpus_per_node=8 \
trainer.nnodes=1 \
trainer.save_freq=-1 \
trainer.test_freq=1 \
trainer.total_epochs=15 $@
If you use AMD GPUs (ROCm kernel), you need to add the following environment variables into the run script:
.. code-block:: bash
export HIP_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
export ROCR_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=$HIP_VISIBLE_DEVICES
If you encounter any issues in using AMD GPUs running VeRL, feel free to contact me - `Yusheng Su <https://yushengsu-thu.github.io/>`_.
Multi-Modal Example Architecture
=================================
Introduction
------------
verl now supports multi-modal training. You can use FSDP and
vLLM/SGLang to start a multi-modal RL task. Megatron support is also
on the way.
Follow the steps below to quickly start a multi-modal RL task.
Step 1: Prepare dataset
-----------------------
.. code:: bash
# it will be saved in the $HOME/data/geo3k folder
python examples/data_preprocess/geo3k.py
Step 2: Download Model
----------------------
.. code:: bash
# download the model from huggingface
python3 -c "import transformers; transformers.pipeline(model='Qwen/Qwen2.5-VL-7B-Instruct')"
Step 3: Perform GRPO training with multi-modal model on Geo3K Dataset
---------------------------------------------------------------------
.. code:: bash
# run the task
bash examples/grpo_trainer/run_qwen2_5_vl-7b.sh
PPO Example Architecture
========================
Let's start with the Proximal Policy Optimization algorithm, which is
the most widely used algorithm in LLM post-training.
The main entry point of the PPO algorithm example is:
`main_ppo.py <https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py>`_.
In this tutorial, we will go through the code architecture in `main_ppo.py <https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py>`_.
Define the data
---------------
Users need to preprocess and store the dataset in parquet files.
We implement ``RLHFDataset`` to load and tokenize the parquet files.
For ``RLHFDataset`` (default), at least one field is required:
- ``prompt``: Contains the string prompt
We already provide some examples of processing the datasets to parquet
files in the `data_preprocess directory <https://github.com/volcengine/verl/blob/main/examples/data_preprocess>`_. Currently, we support
preprocessing of the GSM8k, MATH, HellaSwag and Full_hh_rlhf datasets. See :doc:`../preparation/prepare_data` for
more information.
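For instance, a toy parquet file with the required ``prompt`` field could be produced as below. This is only a sketch: the bundled preprocessing scripts attach additional fields (e.g., the data source and reward information), so prefer them for real runs.

.. code-block:: python

    import pandas as pd

    # Two toy prompts; real preprocessing scripts add extra columns
    # (see examples/data_preprocess/ for the full schema).
    df = pd.DataFrame({
        "prompt": [
            "Natalia sold 48 clips in April and half as many in May. How many clips did she sell?",
            "What is 7 + 13?",
        ],
    })
    df.to_parquet("toy_train.parquet")
    print(pd.read_parquet("toy_train.parquet").head())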
Define the reward functions for different datasets
--------------------------------------------------
In this main entry point, the users only need to define their own reward
function based on the datasets (or applications) utilized in PPO
training.
For example, we already provide reward functions for `GSM8k <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/gsm8k.py>`_
and `MATH <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/math.py>`_
datasets in the ``_select_rm_score_fn``. In the ``RewardManager``, we
will compute the reward score based on the data_source to select
corresponding reward functions. For some RLHF datasets (e.g.,
full_hh_rlhf), the reward model is utilized to assess the responses
without any reward functions. In this case, the ``RewardManager`` will
return the ``rm_score`` computed by the reward model directly.
See `reward functions <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score>`_ for detailed implementation.
Define worker classes
---------------------
.. code:: python
if config.actor_rollout_ref.actor.strategy == 'fsdp': # for FSDP backend
assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
from verl.workers.fsdp_workers import ActorRolloutRefWorker, CriticWorker
from verl.single_controller.ray import RayWorkerGroup
ray_worker_group_cls = RayWorkerGroup
elif config.actor_rollout_ref.actor.strategy == 'megatron': # for Megatron backend
assert config.actor_rollout_ref.actor.strategy == config.critic.strategy
from verl.workers.megatron_workers import ActorRolloutRefWorker, CriticWorker
from verl.single_controller.ray.megatron import NVMegatronRayWorkerGroup
ray_worker_group_cls = NVMegatronRayWorkerGroup # Ray worker class for Megatron-LM
else:
raise NotImplementedError
from verl.trainer.ppo.ray_trainer import ResourcePoolManager, Role
role_worker_mapping = {
Role.ActorRollout: ActorRolloutRefWorker,
Role.Critic: CriticWorker,
Role.RefPolicy: ActorRolloutRefWorker
}
global_pool_id = 'global_pool'
resource_pool_spec = {
global_pool_id: [config.trainer.n_gpus_per_node] * config.trainer.nnodes,
}
mapping = {
Role.ActorRollout: global_pool_id,
Role.Critic: global_pool_id,
Role.RefPolicy: global_pool_id,
}
Step 1: Construct the mapping between roles and workers
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
A role represents a group of workers in the same process. We have
pre-defined several roles in `ray_trainer.py <https://github.com/volcengine/verl/blob/main/verl/trainer/ppo/ray_trainer.py#L38>`_.
.. code:: python
class Role(Enum):
"""
To create more roles dynamically, you can subclass Role and add new members
"""
Actor = 0 # This worker only has Actor
Rollout = 1 # This worker only has Rollout
ActorRollout = 2 # This worker has both actor and rollout, it's a HybridEngine
Critic = 3 # This worker only has critic
RefPolicy = 4 # This worker only has reference policy
RewardModel = 5 # This worker only has reward model
ActorRolloutRef = 6 # This worker contains actor, rollout and reference policy simultaneously
Step 2: Define the worker class corresponding to this role
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- We have pre-implemented the ``ActorRolloutRefWorker``. Through
different configs, it can be a standalone actor, a standalone rollout,
an ActorRollout HybridEngine, or an ActorRolloutRef HybridEngine
- We also pre-implemented workers for ``Actor``, ``Rollout``,
``Critic``, ``Reward Model`` and ``Reference model`` on two different
backend: PyTorch FSDP
and Megatron-LM.
See `FSDP Workers <https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py>`_
and `Megatron-LM Workers <https://github.com/volcengine/verl/blob/main/verl/workers/megatron_workers.py>`_
for more information.
Step 3: Define resource pool id and resource pool spec
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- A resource pool is a division of the global GPU resources;
  ``resource_pool_spec`` is a dict mapping from each pool id to its number of GPUs.
- In the above example, we defined a global resource pool:
global_pool_id, and then put all roles on this one resource pool
with all the GPUs in this post-training task. This refers to
*co-locate* placement where all the models share the same set of
GPUs.
- See resource pool and placement for advanced usage.
Defining reward model/function
------------------------------
.. code:: python
# we should adopt a multi-source reward function here
# - for rule-based rm, we directly call a reward score
# - for model-based rm, we call a model
# - for code related prompt, we send to a sandbox if there are test cases
# - finally, we combine all the rewards together
# - The reward type depends on the tag of the data
if config.reward_model.enable:
from verl.workers.fsdp_workers import RewardModelWorker
role_worker_mapping[Role.RewardModel] = RewardModelWorker
mapping[Role.RewardModel] = global_pool_id
reward_fn = RewardManager(tokenizer=tokenizer, num_examine=0)
# Note that we always use function-based RM for validation
val_reward_fn = RewardManager(tokenizer=tokenizer, num_examine=1)
resource_pool_manager = ResourcePoolManager(resource_pool_spec=resource_pool_spec, mapping=mapping)
Since not all tasks use model-based RM, users need to define here
whether it's a model-based RM or a function-based RM.
- If it's a model-based RM, directly add the ``RewardModel`` role in the
resource mapping and add it to the resource pool mapping.
- Note that the pre-defined ``RewardModelWorker`` only supports models
with the structure of huggingface
``AutoModelForSequenceClassification``. If it's not this model, you
need to define your own RewardModelWorker in `FSDP Workers <https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py>`_
and `Megatron-LM Workers <https://github.com/volcengine/verl/blob/main/verl/workers/megatron_workers.py>`_.
- If it's a function-based RM, users need to specify the reward
  function for each dataset.
.. code:: python
def _select_rm_score_fn(data_source):
if data_source == 'openai/gsm8k':
return gsm8k.compute_score
elif data_source == 'lighteval/MATH':
return math.compute_score
else:
raise NotImplementedError
See reward functions implemented in `directory <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/>`_
for more information.
Define, init and run the PPO Trainer
------------------------------------
.. code:: python
trainer = RayPPOTrainer(config=config,
tokenizer=tokenizer,
role_worker_mapping=role_worker_mapping,
resource_pool_manager=resource_pool_manager,
ray_worker_group_cls=ray_worker_group_cls,
reward_fn=reward_fn,
val_reward_fn=val_reward_fn)
trainer.init_workers()
trainer.fit()
- We first initialize the ``RayPPOTrainer`` with user config, tokenizer
and all the above worker mapping, resource pool, worker group and
reward functions
- We then call ``trainer.init_workers()`` to initialize the models
on the allocated GPUs (in the resource pool)
- The actual PPO training will be executed in ``trainer.fit()``
verl can be easily extended to other RL algorithms by reusing the Ray
model workers, resource pool and reward functions. See :doc:`extension<../advance/dpo_extension>` for
more information.
Details of the ``RayPPOTrainer`` are discussed in :doc:`Ray Trainer<../workers/ray_trainer>`.
Sandbox Fusion Example
============================
Introduction
------------
Sandbox Fusion is a remote code sandbox service that provides a secure environment for running and evaluating code generated by Large Language Models (LLMs). This example demonstrates how to train an LLM and use Sandbox Fusion to verify generated code, enhancing both security and performance.
By leveraging a remote code sandbox service with greater CPU resources for concurrent code verification, you can reduce the reward stage time by 10-30%, depending on the quality of the generated code.
Step 1: Prepare the Dataset
---------------------------
We use the Eurus-2-RL-Data dataset for training. This dataset combines math and code questions, making it suitable for LLM training tasks. You can download it from HuggingFace: `Eurus-2-RL-Data Dataset <https://huggingface.co/datasets/PRIME-RL/Eurus-2-RL-Data>`_.
Step 2: Set Up the Sandbox Fusion Service
-----------------------------------------
Sandbox Fusion is a remote code sandbox service designed to securely run and evaluate LLM-generated code. To use it:
1. **Access Full Documentation**: For detailed setup instructions, refer to the `Sandbox Fusion Documentation <https://bytedance.github.io/SandboxFusion/>`_.
2. **Deploy the Service**: Choose one of the following deployment methods:
- **Local Deployment**: Follow the guide `here <https://bytedance.github.io/SandboxFusion/docs/docs/get-started#local-deployment>`_.
- **FaaS Instance (Volcengine)**: Create an instance using the `Volcengine Documentation <https://www.volcengine.com/docs/6662/1539235>`_.
After deployment, you will receive an API endpoint in the format: ``https://<ip-address-or-domain-name>/run_code``.
Step 3: Configure the Training Script
-------------------------------------
To integrate Sandbox Fusion into your training script, configure the following parameters:
**Key Settings for Sandbox Fusion**
- ``reward_model.sandbox_fusion.url='<API-endpoint>'``: Enable Sandbox Fusion by specifying the API endpoint (must end with ``/run_code``).
- ``reward_model.sandbox_fusion.max_concurrent=256``: Set the maximum number of concurrent API requests to the Sandbox Fusion service.
**Additional Optimization**
To further reduce code verification time, enable parallel processing with:
- ``reward_model.reward_manager=prime``: The Prime reward manager verifies code across multiple subprocesses concurrently.
**Example Script**
For a practical implementation, refer to the example script:
``examples/ppo_trainer/run_deepseek7b_llm_sandbox_fusion.sh``
Once you’ve set your API endpoint in the script, you can start the training job.
.. _algo-baseline-page:
Algorithm Baselines
===================
GSM8k
------------------
Assuming the GSM8k dataset has been preprocessed via ``python3 examples/data_preprocess/gsm8k.py``.
Refer to the table below to reproduce PPO training from different pre-trained models.
.. _Huggingface: https://huggingface.co/google/gemma-2-2b-it#benchmark-results
.. _SFT Command and Logs: https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-sft-0.411.log
.. _SFT+PPO Command and Logs: https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/gemma-2-2b-it-ppo-bsz512_4-prompt1024-resp-512-0.640.log
.. _wandb: https://api.wandb.ai/links/verl-team/h7ux8602
.. _Qwen Blog: https://qwenlm.github.io/blog/qwen2.5-llm/
.. _PPO Command and Logs: https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/Qwen2.5-0.5B-bsz256_2-prompt1024-resp512-0.567.log
.. _Megatron PPO Command and Logs: https://github.com/eric-haibin-lin/verl-data/blob/experiments/gsm8k/deepseek-llm-7b-chat-megatron-bsz256_4-prompt512-resp512-0.695.log
.. _Qwen7b GRPO Script: https://github.com/volcengine/verl/blob/a65c9157bc0b85b64cd753de19f94e80a11bd871/examples/grpo_trainer/run_qwen2-7b_seq_balance.sh
.. _Megatron wandb: https://wandb.ai/verl-team/verl_megatron_gsm8k_examples/runs/10fetyr3
.. _Qwen7b ReMax Script: https://github.com/eric-haibin-lin/verl/blob/main/examples/remax_trainer/run_qwen2.5-3b_seq_balance.sh
.. _Qwen7b ReMax Wandb: https://wandb.ai/liziniu1997/verl_remax_example_gsm8k/runs/vxl10pln
.. list-table::
   :widths: auto
   :header-rows: 1

   * - Model
     - Method
     - Test score
     - Details
   * - google/gemma-2-2b-it
     - pretrained checkpoint
     - 23.9
     - `Huggingface`_
   * - google/gemma-2-2b-it
     - SFT
     - 52.06
     - `SFT Command and Logs`_
   * - google/gemma-2-2b-it
     - SFT + PPO
     - 64.02
     - `SFT+PPO Command and Logs`_, `wandb`_
   * - Qwen/Qwen2.5-0.5B-Instruct
     - pretrained checkpoint
     - 36.4
     - `Qwen Blog`_
   * - Qwen/Qwen2.5-0.5B-Instruct
     - PPO
     - 56.7
     - `PPO Command and Logs`_
   * - deepseek-ai/deepseek-llm-7b-chat
     - PPO
     - 69.5 [1]_
     - `Megatron PPO Command and Logs`_, `Megatron wandb`_
   * - Qwen/Qwen2-7B-Instruct
     - GRPO
     - 89
     - `Qwen7b GRPO Script`_
   * - Qwen/Qwen2.5-7B-Instruct
     - ReMax
     - 97
     - `Qwen7b ReMax Script`_, `Qwen7b ReMax Wandb`_
.. [1] During the evaluation, we only extracted answers following the "####" format. A more flexible answer extraction, longer response length and better prompt engineering may lead to a higher score.
Frequently Asked Questions
====================================
Ray related
------------
How to add breakpoint for debugging with distributed Ray?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Please checkout the official debugging guide from Ray: https://docs.ray.io/en/latest/ray-observability/ray-distributed-debugger.html
"Unable to register worker with raylet"
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
This issue is caused by system settings, e.g., SLURM adding constraints on how CPUs are shared on a node.
While `ray.init()` tries to launch as many worker processes as there are CPU cores on the machine,
these SLURM constraints prevent the `core-workers` from seeing the `raylet` process, leading to the problem.
To fix this issue, you can set the config term ``ray_init.num_cpus`` to a number allowed by your system.
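That config term effectively caps the CPUs Ray claims at initialization, roughly equivalent to the sketch below if you call ``ray.init()`` yourself (16 is just an example value):

.. code-block:: python

    import ray

    # Cap the number of CPUs Ray claims so it fits the allowance granted
    # by the cluster scheduler (e.g., SLURM). 16 is an example value.
    ray.init(num_cpus=16)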
Distributed training
------------------------
How to run multi-node post-training with Ray?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
You can start a ray cluster and submit a ray job, following the official guide from Ray: https://docs.ray.io/en/latest/ray-core/starting-ray.html
Then, in the configuration, set the ``trainer.nnodes`` config to the number of machines for your job.
How to use verl on a Slurm-managed cluster?
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Ray provides users with `this <https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html>`_ official
tutorial to start a Ray cluster on top of Slurm. We have verified the :doc:`GSM8K example<../examples/gsm8k_example>`
on a Slurm cluster under a multi-node setting with the following steps.
1. [Optional] If your cluster supports `Apptainer or Singularity <https://apptainer.org/docs/user/main/>`_ and you wish
to use it, convert verl's Docker image to an Apptainer image. Alternatively, set up the environment with the package
manager available on your cluster or use other container runtimes (e.g. through `Slurm's OCI support <https://slurm.schedmd.com/containers.html>`_) available to you.
.. code:: bash
apptainer pull /your/dest/dir/vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3.sif docker://verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3
2. Follow :doc:`GSM8K example<../examples/gsm8k_example>` to prepare the dataset and model checkpoints.
3. Modify `examples/slurm/ray_on_slurm.slurm <https://github.com/volcengine/verl/blob/main/examples/slurm/ray_on_slurm.slurm>`_ with your cluster's own information.
4. Submit the job script to the Slurm cluster with `sbatch`.
Please note that Slurm cluster setup may vary. If you encounter any issues, please refer to Ray's
`Slurm user guide <https://docs.ray.io/en/latest/cluster/vms/user-guides/community/slurm.html>`_ for common caveats.
If you changed Slurm resource specifications, please make sure to update the environment variables in the job script if necessary.
Install related
------------------------
NotImplementedError: TensorDict does not support membership checks with the `in` keyword.
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Detail error information:
.. code:: bash
NotImplementedError: TensorDict does not support membership checks with the `in` keyword. If you want to check if a particular key is in your TensorDict, please use `key in tensordict.keys()` instead.
Cause of the problem: there is no suitable version of the tensordict package for the linux-arm64 platform. You can confirm this as follows:
.. code:: bash
pip install tensordict==0.6.2
Output example:
.. code:: bash
ERROR: Could not find a version that satisfies the requirement tensordict==0.6.2 (from versions: 0.0.1a0, 0.0.1b0, 0.0.1rc0, 0.0.2a0, 0.0.2b0, 0.0.3, 0.1.0, 0.1.1, 0.1.2, 0.8.0, 0.8.1, 0.8.2, 0.8.3)
ERROR: No matching distribution found for tensordict==0.6.2
Solution 1:
Install tensordict from source:
.. code:: bash
pip uninstall tensordict
git clone https://github.com/pytorch/tensordict.git
cd tensordict/
git checkout v0.6.2
python setup.py develop
pip install -v -e .
Solution 2:
Temporarily modify the code where the error occurs, changing ``tensordict_var`` to ``tensordict_var.keys()``, as in the sketch below.
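A minimal before/after sketch of that workaround:

.. code-block:: python

    import torch
    from tensordict import TensorDict

    td = TensorDict({"obs": torch.zeros(2, 3)}, batch_size=[2])

    # Fails on the affected tensordict versions:
    # if "obs" in td: ...

    # Workaround suggested by the error message:
    if "obs" in td.keys():
        print("key found")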
Illegal memory access
---------------------------------
If you encounter an error message like ``CUDA error: an illegal memory access was encountered`` during rollout, it is most likely due to a known issue in vLLM (<= 0.6.3).
Please set the following environment variable. The env var must be set before the ``ray start`` command, if any is used.
.. code:: bash
export VLLM_ATTENTION_BACKEND=XFORMERS
If in doubt, print this env var in each rank to make sure it is properly set.
Checkpoints
------------------------
If you want to convert the model checkpoint into huggingface safetensor format, please refer to ``scripts/model_merger.py``.
Triton ``compile_module_from_src`` error
------------------------------------------------
If you encounter triton compilation error similar to the stacktrace below, please set the ``use_torch_compile`` flag according to
https://verl.readthedocs.io/en/latest/examples/config.html to disable just-in-time compilation for fused kernels.
.. code:: bash
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/jit.py", line 345, in <lambda>
return lambda *args, **kwargs: self.run(grid=grid, warmup=False, *args, **kwargs)
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/autotuner.py", line 338, in run
return self.fn.run(*args, **kwargs)
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/jit.py", line 607, in run
device = driver.active.get_current_device()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 23, in __getattr__
self._initialize_obj()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 20, in _initialize_obj
self._obj = self._init_fn()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/driver.py", line 9, in _create_driver
return actives[0]()
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 371, in __init__
self.utils = CudaUtils() # TODO: make static
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 80, in __init__
mod = compile_module_from_src(Path(os.path.join(dirname, "driver.c")).read_text(), "cuda_utils")
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/backends/nvidia/driver.py", line 57, in compile_module_from_src
so = _build(name, src_path, tmpdir, library_dirs(), include_dir, libraries)
File "/data/lbh/conda_envs/verl/lib/python3.10/site-packages/triton/runtime/build.py", line 48, in _build
ret = subprocess.check_call(cc_cmd)
File "/data/lbh/conda_envs/verl/lib/python3.10/subprocess.py", line 369, in check_call
raise CalledProcessError(retcode, cmd)
What is the meaning of train batch size, mini batch size, and micro batch size?
------------------------------------------------------------------------------------------
This figure illustrates the relationship between different batch size configurations.
https://excalidraw.com/#json=pfhkRmiLm1jnnRli9VFhb,Ut4E8peALlgAUpr7E5pPCA
.. image:: https://github.com/user-attachments/assets/16aebad1-0da6-4eb3-806d-54a74e712c2d
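As a rough illustration of how these knobs relate, here is a sketch with made-up numbers; the names follow the config keys used in the example scripts, and the exact normalization is done inside the workers.

.. code-block:: python

    # Made-up numbers illustrating the usual relationship between the knobs.
    train_batch_size = 1024            # prompts sampled per training step (global)
    ppo_mini_batch_size = 256          # prompts per PPO optimizer update (global)
    ppo_micro_batch_size_per_gpu = 16  # per-GPU chunk for one forward/backward pass
    n_gpus = 8

    updates_per_step = train_batch_size // ppo_mini_batch_size              # 4
    mini_batch_per_gpu = ppo_mini_batch_size // n_gpus                       # 32
    grad_accum_steps = mini_batch_per_gpu // ppo_micro_batch_size_per_gpu    # 2
    print(updates_per_step, mini_batch_per_gpu, grad_accum_steps)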
How to generate ray timeline to analyse performance of a training job?
------------------------------------------------------------------------------------------
To generate the ray timeline file, you can set the config term ``ray_init.timeline_file`` to a json file path.
For example:
.. code:: bash
ray_init.timeline_file=/tmp/ray_timeline.json
The file will be generated in the specified path at the end of a training job.
You can use tools like chrome://tracing or the Perfetto UI and view the ray timeline file.
This figure shows the ray timeline file generated from a training job on 1 node with 4 GPUs.
.. image:: https://github.com/eric-haibin-lin/verl-community/blob/main/docs/ray_timeline.png?raw=true
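Under the hood this corresponds to Ray's timeline export; you can also dump the trace manually from the driver with Ray's public ``ray.timeline`` API, roughly as sketched below.

.. code-block:: python

    import ray

    ray.init()
    # ... run your workload ...
    ray.timeline(filename="/tmp/ray_timeline.json")  # view in chrome://tracing or Perfetto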
How to set proxy only for wandb?
------------------------------------------------------------------------------------------
If you need a proxy to access wandb, you can add the config below to your training job script.
Compared to using a global https_proxy environment variable, this approach won't affect other HTTP requests, such as those from the ChatCompletionScheduler.
.. code:: bash
+trainer.wandb_proxy=http://<your proxy and port>
Welcome to verl's documentation!
================================================
verl is a flexible, efficient and production-ready RL training framework designed for large language models (LLMs) post-training. It is an open source implementation of the `HybridFlow <https://arxiv.org/pdf/2409.19256>`_ paper.
verl is flexible and easy to use with:
- **Easy extension of diverse RL algorithms**: The hybrid programming model combines the strengths of single-controller and multi-controller paradigms to enable flexible representation and efficient execution of complex post-training dataflows, allowing users to build RL dataflows in a few lines of code.
- **Seamless integration of existing LLM infra with modular APIs**: Decouples computation and data dependencies, enabling seamless integration with existing LLM frameworks, such as PyTorch FSDP, Megatron-LM, vLLM and SGLang. Moreover, users can easily extend to other LLM training and inference frameworks.
- **Flexible device mapping and parallelism**: Supports various placement of models onto different sets of GPUs for efficient resource utilization and scalability across different cluster sizes.
- Ready integration with popular HuggingFace models
verl is fast with:
- **State-of-the-art throughput**: By seamlessly integrating existing SOTA LLM training and inference frameworks, verl achieves high generation and training throughput.
- **Efficient actor model resharding with 3D-HybridEngine**: Eliminates memory redundancy and significantly reduces communication overhead during transitions between training and generation phases.
--------------------------------------------
.. _Contents:
.. toctree::
:maxdepth: 2
:caption: Quickstart
start/install
start/quickstart
start/multinode
start/ray_debug_tutorial
.. toctree::
:maxdepth: 2
:caption: Programming guide
hybrid_flow
single_controller
.. toctree::
:maxdepth: 1
:caption: Data Preparation
preparation/prepare_data
preparation/reward_function
.. toctree::
:maxdepth: 2
:caption: Configurations
examples/config
.. toctree::
:maxdepth: 1
:caption: PPO Example
examples/ppo_code_architecture
examples/gsm8k_example
examples/multi_modal_example
.. toctree::
:maxdepth: 1
:caption: Algorithms
algo/ppo.md
algo/grpo.md
algo/dapo.md
algo/spin.md
algo/sppo.md
algo/opo.md
algo/baseline.md
.. toctree::
:maxdepth: 1
:caption: PPO Trainer and Workers
workers/ray_trainer
workers/fsdp_workers
workers/megatron_workers
workers/sglang_worker
.. toctree::
:maxdepth: 1
:caption: Performance Tuning Guide
perf/dpsk.md
perf/perf_tuning
README_vllm0.8.md
perf/device_tuning
.. toctree::
:maxdepth: 1
:caption: Adding new models
advance/fsdp_extension
advance/megatron_extension
.. toctree::
:maxdepth: 1
:caption: Advanced Features
advance/checkpoint
advance/rope
advance/ppo_lora.rst
sglang_multiturn/multiturn.rst
advance/placement
advance/dpo_extension
examples/sandbox_fusion_example
.. toctree::
:maxdepth: 1
:caption: API References
api/data
api/single_controller.rst
api/trainer.rst
api/utils.rst
.. toctree::
:maxdepth: 2
:caption: FAQ
faq/faq
Contribution
-------------
verl is free software; you can redistribute it and/or modify it under the terms
of the Apache License 2.0. We welcome contributions.
Join us on `GitHub <https://github.com/volcengine/verl>`_, `Slack <https://join.slack.com/t/verlgroup/shared_invite/zt-2w5p9o4c3-yy0x2Q56s_VlGLsJ93A6vA>`_ and `Wechat <https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/WeChat.JPG>`_ for discussions.
Contributions from the community are welcome! Please check out our `project roadmap <https://github.com/volcengine/verl/issues/710>`_ and `good first issues <https://github.com/volcengine/verl/issues?q=is%3Aissue%20state%3Aopen%20label%3A%22good%20first%20issue%22>`_ to see where you can contribute.
Code Linting and Formatting
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
We use pre-commit to help improve code quality. To initialize pre-commit, run:
.. code-block:: bash
pip install pre-commit
pre-commit install
To resolve CI errors locally, you can also manually run pre-commit by:
.. code-block:: bash
pre-commit run
Adding CI tests
^^^^^^^^^^^^^^^^^^^^^^^^
If possible, please add CI test(s) for your new feature:
1. Find the most relevant workflow yml file, which usually corresponds to a ``hydra`` default config (e.g. ``ppo_trainer``, ``ppo_megatron_trainer``, ``sft_trainer``, etc).
2. Add related path patterns to the ``paths`` section if not already included.
3. Minimize the workload of the test script(s) (see existing scripts for examples).
We are HIRING! Send us an `email <mailto:haibin.lin@bytedance.com>`_ if you are interested in internship/FTE opportunities in MLSys/LLM reasoning/multimodal alignment.
Resources Needed for verl RL (LoRA)
========================================
Since RL requires more resources than regular training, determining how many
resources are needed to run it successfully is relatively difficult. To provide
reference points for resource selection across different models and tasks, this
section introduces the environment requirements based on experiments we have conducted.
However, due to limited staff and equipment, we also hope for more
contributions from the open-source community. When submitting a PR, please
provide a script to be added to the ``examples/tuning`` scripts.
We need two types of scripts: one is the configuration that can run with the **minimum
resources (min)**, and the other is the configuration that runs with **recommended resources (recommended)**. The former
can be understood as a script that runs after applying all memory optimization techniques
(e.g., offload, gradient checkpointing); the latter as a script that
avoids operations incurring additional time overhead as much as possible (targeting best throughput).
When defining script names, please follow this format:
``[model]_[task]_[gpunums]_[device]_[train]_[infer].sh``. This will effectively improve
the script's recognizability. You can place the script under the ``examples/tuning/`` directory.
If you happen to have a configuration that has already been tested, we welcome you to submit
a PR and include a screenshot from Wandb or other verifiable evidence.
----------------------------------------
0.5B
~~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2.5-0.5B
- GRPO-LoRA
- 1*H100
- 116
- fsdp
- vllm0.8.3
- `qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/0.5b/qwen2-0.5b_grpo-lora_1_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
1.5B
~~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2.5-1.5B
- GRPO-LoRA
- 1*H100
- 128
- fsdp
- vllm0.8.3
- `qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/1.5b/qwen2-1.5b_grpo-lora_1_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
3B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2.5-3B
- GRPO-LoRA
- 1*H100
- 62
- fsdp
- vllm0.8.3
- `qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/3b/qwen2-3b_grpo-lora_1_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
7B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-7B
- GRPO
- 2*H800
- \
- fsdp
- vllm0.8.2
- `qwen2-7b_grpo_2_h800_fsdp_vllm <https://github.com/volcengine/verl/blob/main/examples/tuning/7b/qwen2-7b_grpo_2_h800_fsdp_vllm.sh>`_
- `Xiangyongan <xiangyongan@bytedance.com>`_
* - MIN
- Qwen2.5-7B
- GRPO-LoRA
- 1*H100
- 16
- fsdp
- vllm0.8.3
- `qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/7b/qwen2-7b_grpo-lora_1_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
14B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-14B
- GRPO
- 4*H800
- \
- fsdp
- vllm0.8.2
- `qwen2-14b_grpo_4_h800_fsdp_vllm <https://github.com/volcengine/verl/blob/main/examples/tuning/14b/qwen2-14b_grpo_4_h800_fsdp_vllm.sh>`_
- `Xiangyongan <xiangyongan@bytedance.com>`_
* - MIN
- Qwen2.5-14B
- GRPO-LoRA
- 2*H100
- 116
- fsdp
- vllm0.8.3
- `qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/14b/qwen2-14b_grpo-lora_2_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
32B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-32B
- GRPO
- 8*H20
- \
- megatron
- vllm0.8.2
- `qwen2-32b_grpo_8_h20_megatron_vllm <https://github.com/volcengine/verl/tree/main/examples/tuning/32b/qwen2_32B_grpo_8_h20_megatron_vllm.sh>`_
- `Xiangyongan <xiangyongan@bytedance.com>`_
* - MIN
- Qwen2.5-32B
- GRPO-LoRA
- 4*H100
- 180
- fsdp
- vllm0.8.3
- `qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/32b/qwen2-32b_grpo-lora_4_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
70B
~~~
.. list-table::
:widths: auto
:header-rows: 1
* - Tag
- Model
- Task
- Resource
- MaxBatch
- Train
- Infer
- Link
- Contributor
* - MIN
- Qwen2-70B
- GRPO
- 32*H20
- \
- fsdp
- vllm0.8.2
- `qwen2-70b_grpo_32_h20_fsdp_vllm <https://github.com/volcengine/verl/blob/main/examples/tuning/70b/qwen2-70b_grpo_32_h20_fsdp_vllm.sh>`_
- `Xiangyongan <xiangyongan@bytedance.com>`_
* - MIN
- Qwen2-70B
- GRPO
- 32*H800
- \
- fsdp
- vllm0.8.3
- `qwen2-70b_grpo_32_h800_fsdp_vllm <https://github.com/volcengine/verl/blob/main/examples/tuning/70b/qwen2-70b_grpo_32_h800_fsdp_vllm.sh>`_
- `Xiangyongan <xiangyongan@bytedance.com>`_
* - MIN
- Qwen2.5-72B
- GRPO-LoRA
- 8*H100
- 176
- fsdp
- vllm0.8.3
- `qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh <https://github.com/volcengine/verl/blob/main/examples/tuning/70b/qwen2-72b_grpo-lora_8_h100_fsdp_vllm.sh>`_
- `SimonHuang <thelongestusernameofall@gmail.com>`_
405B
~~~~
.. table::
:widths: auto
====== ====== ====== ======== ======== ====== ====== ======
tag model task resource MaxBatch train infer link
====== ====== ====== ======== ======== ====== ====== ======
\ \ \ \ \ \ \
====== ====== ====== ======== ======== ====== ====== ======
671B
~~~~
.. table::
:widths: auto
====== ====== ====== ======== ======== ====== ====== ======
tag model task resource MaxBatch train infer link
====== ====== ====== ======== ======== ====== ====== ======
\ \ \ \ \ \ \
====== ====== ====== ======== ======== ====== ====== ======
# Training DeepSeek 671b
verl integrates Megatron to support large MoE models such as `Qwen3-235B-A22B` and `deepseek-ai/DeepSeek-V3`. This is an ongoing community effort.
Along the way, the community added the following features and optimizations that enable verl to support larger models:
- per tensor weight resharding between rollout and training
- context parallelism and expert parallelism enabled via megatron
- dynamic batch size (sequence balance) for megatron
- reduced ray-related serialization overhead
- optimizer offloading, recomputation, and efficient kernels
- various debugging metrics and utils
The Megatron backend now supports a wider list of models:
- DeepSeek-V3
- Moonlight
- Qwen3
- Qwen2.5-VL (to be merged soon)
- Qwen2
- Mixtral
## Getting Started
### DeepSeek 671b
The recommended image with pre-built megatron dependency is `whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6.post5-mcore0.12.1-te2.3-deepseekv3`, built with the Dockerfile in [docker/Dockerfile.vllm.sglang.megatron.deepseek](https://github.com/volcengine/verl/blob/main/docker/Dockerfile.vllm.sglang.megatron.deepseek).
For checkpoint loading, we rely on megatron dist-ckpt for resharding. A converted dist-ckpt for DeepSeek-V3 is available from [huggingface BearBiscuit05/dpsk-v3-671B-BF16-dist_ckpt](https://huggingface.co/BearBiscuit05/dpsk-v3-671B-BF16-dist_ckpt/tree/main).
To run end-to-end training on the DAPO dataset, run [recipe/dapo/test_dapo_dspk_671b_megatron.sh](https://github.com/volcengine/verl/blob/main/recipe/dapo/test_dapo_dspk_671b_megatron.sh). It runs on 512 H20(96GB) GPUs with the following setup:
- vllm rollout with TP=32, bfloat16
- megatron training with attention DP, MoE EP=32, PP=16, bfloat16
MTP is disabled during RL training.
### Qwen3 236b
For Qwen3-236b, please refer to [examples/grpo_trainer/run_qwen3-236b_megatron.sh](https://github.com/volcengine/verl/blob/main/examples/grpo_trainer/run_qwen3-236b_megatron.sh), which runs on 128 H20(96GB) GPUs.
## Upcoming Optimizations
The community continues to optimize large MoE models further; ongoing efforts include:
- further optimizing memory consumption and providing recommended/tuned configurations for various machine types
- optimizing long context RL training performance
- performance improvement with SGLang x Megatron
We invite the community to try and improve verl together. Get connected with us on [slack](https://join.slack.com/t/verlgroup/shared_invite/zt-2w5p9o4c3-yy0x2Q56s_VlGLsJ93A6vA)/[wechat](https://raw.githubusercontent.com/eric-haibin-lin/verl-community/refs/heads/main/WeChat.JPG)/[Github issues](https://github.com/volcengine/verl/issues/708)!
## Acknowledgement
@vermouth1992 @ISEEKYAN @ETOgaosion @yzlnew @ShareLer @BearBiscuit05 @ccclyu @ann-qin-lu @SwordFaith @zzong2006 @zhaochenyang20 @ocss884 @eric-haibin-lin
Performance Tuning Guide
==============================
Author: `Guangming Sheng <https://github.com/PeterSH6>`_
In this section, we will discuss how to tune the performance of all the stages in verl, including:
1. Rollout generation throughput.
2. Enable ``use_remove_padding=True`` for sequence packing (i.e., data packing and remove padding).
3. Batch size tuning for forward and backward computation
4. Enable ``use_dynamic_bsz=True`` for higher throughput.
5. Utilize Ulysses Sequence Parallel for Long Context Training
6. LigerKernel for SFT performance optimization
Rollout Generation Tuning
--------------------------
verl currently supports two rollout backends: vLLM and TGI (with SGLang support coming soon).
Below are key factors for tuning vLLM-based rollout. Before tuning, we recommend setting ``actor_rollout_ref.rollout.disable_log_stats=False`` so that rollout statistics are logged.
- Increase ``gpu_memory_utilization``.
- For vLLM v0.5.4 and v0.6.3, the vLLM pre-allocates GPU KVCache by using gpu_memory_utilization of the **remaining** memory.
- For vLLM v0.7.0 and later, the vLLM instance will only use gpu_memory_utilization of the **total** memory.
- For SGLang, it's the fraction of the free GPU memory used for **static** memory like model weights and KV cache. However, the remaining (1-gpu_memory_utilization) will also be used during inference.
However, if model parameters and optimizer states are not offloaded, using too high a fraction can lead to OOM.
A value between 0.5 and 0.7 often strikes a good balance between high throughput and avoiding OOM.
Note: since the definition of ``gpu_memory_utilization`` varies across inference engines, a value that works well for one engine may cause OOM for another.
- Adjust ``max_num_seqs`` or ``max_num_batched_tokens``.
If the GPU cache utilization is relatively low in the log, increasing ``max_num_seqs`` or ``max_num_batched_tokens``
can enlarge the effective batch size in the decoding stage, allowing more concurrent requests per batch.
We recommend setting ``max_num_batched_tokens > 2048`` for higher throughput.
- Use a smaller ``tensor_parallel_size``.
When GPU resources allow, a smaller tensor parallel size spawns more vLLM replicas.
Data parallelism (DP) can yield higher throughput than tensor parallelism (TP), but also increases KVCache consumption.
Carefully balance the trade-off between more replicas and higher memory usage.
Our experiments in Sec. 8.4 of the `HybridFlow paper <https://arxiv.org/pdf/2409.19256v2>`_ evaluate this trade-off.
More tuning details, such as dealing with preemption and chunked prefill,
can be found in the `vLLM official tuning guide <https://docs.vllm.ai/en/latest/performance/optimization.html>`_.
The performance of vLLM can be further improved by upgrading from v0.6.3 to v0.7. See https://github.com/volcengine/verl/blob/main/docs/README_vllm0.7.md for details on how to upgrade.
Enable remove padding (sequence packing)
-----------------------------------------
Currently, for llama, mistral, gemma1 and qwen based models, users can enable `use_remove_padding=True` to utilize the
sequence packing implementation provided by the transformers library.
For other models, the transformers library may also support it, but we haven't tested it yet.
Users can add the desired model config to the `test_transformer.py <https://github.com/volcengine/verl/blob/main/tests/models/test_transformer.py#L24>`_ file
and test its functionality by running the following command:
.. code-block:: bash
pytest -s tests/models/test_transformer.py
If the test passes, you can add your desired model into the model `registry.py <https://github.com/volcengine/verl/blob/main/verl/models/registry.py#L24>`_ file.
Then you can enjoy the performance boost of sequence packing,
and you are welcome to PR your tested model to verl!
Batch Size Tuning
-----------------
To achieve higher throughput in experience preparation (i.e., model fwd) and model update (i.e., actor/critic fwd/bwd),
users may need to tune ``*micro_batch_size_per_gpu`` for the different computations.
In verl, the core principle for setting batch sizes is:
- **Algorithmic metrics** (train batch size, PPO mini-batch size) are *global* (from a single-controller perspective),
normalized in each worker. See the `normalization code <https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py#L120-L122>`_.
- **Performance-related parameters** (micro batch size, max token length for dynamic batch size) are *local* parameters that define the per-GPU data allocations.
See the `normalization code <https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py#L127>`_.
.. note:: In your training script, please use ``*micro_batch_size_per_gpu`` instead of ``*micro_batch_size``.
This way you don't need to handle the normalization of ``micro_batch_size``, which will be deprecated. A small numeric sketch of this normalization follows.
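As a concrete illustration of the split between global and local parameters, the sketch below walks through the arithmetic for a hypothetical setup. The numbers and the division by the data-parallel world size are assumptions for illustration; see the normalization code linked above for the exact logic.

.. code-block:: python

   # Hypothetical numbers for illustration only.
   n_gpus = 8                          # data-parallel world size
   ppo_mini_batch_size = 256           # *global* algorithmic batch size
   ppo_micro_batch_size_per_gpu = 4    # *local* performance knob

   # Each worker sees its share of the global mini-batch ...
   mini_batch_per_gpu = ppo_mini_batch_size // n_gpus                      # 32 samples
   # ... and splits it into micro-batches for gradient accumulation.
   grad_accum_steps = mini_batch_per_gpu // ppo_micro_batch_size_per_gpu   # 8 steps

   print(mini_batch_per_gpu, grad_accum_steps)

Tuning ``*micro_batch_size_per_gpu`` only changes how the per-GPU share is chunked; the algorithmic batch sizes stay untouched.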
Batch Size Tuning tips
""""""""""""""""""""""
Therefore, users may need to tune ``*micro_batch_size_per_gpu`` to accelerate training. Here are some tips:
1. **Enable gradient checkpointing**:
Set ``actor_rollout_ref.model.enable_gradient_checkpointing=True`` and ``critic.model.enable_gradient_checkpointing=True``.
This often allows for larger micro-batch sizes and will be beneficial for large mini-batch training.
2. **Increase the micro batch size as much as possible**:
   Increase ``*micro_batch_size_per_gpu`` until it equals the normalized ``mini_batch_size``.
3. **Use larger forward-only parameters**:
   Forward-only parameters, such as ``actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu``,
   ``actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu``, and ``critic.forward_micro_batch_size_per_gpu``, can be larger (e.g., 2x) than training-related micro batch sizes
   such as ``actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu`` and ``critic.ppo_micro_batch_size_per_gpu``.
4. **Allow larger micro-batch sizes for Critic and Reward models**:
   The micro batch size of the Critic and Reward models can be larger than the Actor's. This is because the Actor's final layer projects onto the full vocabulary, which makes its forward/backward pass much more memory-hungry.
5. **Enable activation offloading**:
Set ``actor_rollout_ref.model.enable_activation_offload=True`` and ``critic.model.enable_activation_offload=True``.
This often works together with gradient checkpointing to allow larger micro-batch sizes; it is currently only available in the FSDP backend.
Tuning for Dynamic Batch Size
-----------------------------
Dynamic batch size is a technique that allows the model to process a similar number of tokens in each forward pass (with varying actual batch sizes).
This can significantly improve the training efficiency and reduce the memory usage.
To utilize this technique, users can set ``use_dynamic_bsz=True`` in actor, ref, critic and reward models.
With ``use_dynamic_bsz=True``, users don't need to tune ``*micro_batch_size_per_gpu``.
Instead, users should tune the following parameters:
- ``actor_rollout_ref.actor.ppo_max_token_len_per_gpu``, ``critic.ppo_max_token_len_per_gpu``:
The maximum number of tokens to be processed in fwd and bwd of ``update_policy`` and ``update_critic``.
- ``actor_rollout_ref.ref.log_prob_max_token_len_per_gpu`` and ``actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu``:
The maximum number of tokens to be processed in the fwd computation of ``compute_log_prob`` and ``compute_ref_log_prob``.
- ``critic.forward_micro_batch_size_per_gpu``, ``reward_model.forward_micro_batch_size_per_gpu``:
The maximum number of tokens to be processed in the fwd computation of ``compute_values`` and ``compute_rm_score``.
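To make the token-budget idea concrete, the following is a minimal greedy-packing sketch (not verl's actual implementation) showing how sequences of different lengths are grouped so that each forward pass stays under the per-GPU token limit:

.. code-block:: python

   def pack_by_token_budget(seq_lens, max_token_len_per_gpu):
       """Greedily group sequence indices so each group's total length stays under the budget."""
       groups, current, current_tokens = [], [], 0
       for i, n in enumerate(seq_lens):
           if current and current_tokens + n > max_token_len_per_gpu:
               groups.append(current)
               current, current_tokens = [], 0
           current.append(i)
           current_tokens += n
       if current:
           groups.append(current)
       return groups

   # Sequences of varying length, with an 8k-token budget per forward pass:
   print(pack_by_token_budget([3000, 1000, 5000, 2000, 7000], 8000))
   # [[0, 1], [2, 3], [4]]

Each group becomes one forward pass, so every pass processes a similar number of tokens even though the number of sequences per pass varies.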
Dynamic Batch Size Tuning tips
""""""""""""""""""""""""""""""
Here are some tips to tune the above parameters; a short numeric sketch follows this list:
1. **Increase** ``actor_rollout_ref.actor.ppo_max_token_len_per_gpu``
Make it at least 2 x (max_prompt_length + max_response_length). We set it to 3x in `run_qwen2-7b_rm_seq_balance.sh <https://github.com/volcengine/verl/blob/main/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh#L25>`_.
Try to increase it to get higher throughput.
2. **Forward-only parameters can be larger**:
Similar to the non-dynamic-batch scenario, forward-only token limits can exceed those used in forward/backward operations.
3. **Use larger limits for Critic and Reward models**:
Critic and Reward parameters can be set at least 2× the Actor’s limits. For instance, we set them to 4× here:
`run_qwen2-7b_rm_seq_balance.sh <https://github.com/volcengine/verl/blob/main/examples/ppo_trainer/run_qwen2-7b_rm_seq_balance.sh#L40>`_
.. :math:`\text{critic.ppo_max_token_len_per_gpu} = 2 \times \text{actor.ppo_max_token_len_per_gpu}`
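Putting these tips together, here is a quick numeric sketch with assumed prompt/response lengths; the multipliers follow the rules of thumb above and should be tuned for your own workload:

.. code-block:: python

   # Assumed lengths, for illustration only.
   max_prompt_length = 1024
   max_response_length = 1024
   seq_len = max_prompt_length + max_response_length        # 2048 tokens per sample

   actor_budget = 3 * seq_len              # at least 2x; 3x following the example script -> 6144
   forward_only_budget = 2 * actor_budget  # forward-only limits can exceed fwd/bwd ones -> 12288
   critic_budget = 2 * actor_budget        # Critic/Reward at least 2x the Actor's limit -> 12288

   print(actor_budget, forward_only_budget, critic_budget)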
Ulysses Sequence Parallel for Long Context Training
----------------------------------------------------
To utilize this technique, users can set ``ulysses_sequence_parallel_size>1`` in actor, ref, critic and reward models.
We support different models using different ``ulysses_sequence_parallel_size`` values.
To train on long sequences (>32k), users may need to decrease ``*micro_batch_size_per_gpu`` and ``*max_token_len_per_gpu`` to avoid OOM.
LigerKernel for SFT
----------------------
LigerKernel is a high-performance kernel for Supervised Fine-Tuning (SFT) that can improve training efficiency. To enable LigerKernel in your SFT training:
1. Install liger-kernel via ``pip3 install liger-kernel``.
2. In your SFT configuration file (e.g., ``verl/trainer/config/sft_trainer.yaml``), set the ``use_liger`` parameter:
.. code-block:: yaml

   model:
     use_liger: True  # Enable LigerKernel for SFT
The default value is ``False``; enable it only when you want to use LigerKernel's optimizations. LigerKernel is particularly useful for improving training performance in SFT scenarios.
Prepare Data for Post-Training
========================================
Before starting the post-training job, we need to prepare the data for
the policy training. The data should be stored in the parquet format.
We provide several data preprocess scripts for different datasets,
including GSM8K, MATH, HellaSwag and Full_hh_rlhf. To prepare other datasets, follow the steps below.
The data preprocess script can be divided into two parts:
1. The first part is the common part, which loads the dataset from
huggingface's ``datasets`` package, preprocesses it with
``make_map_fn``, and stores it in the parquet format.
.. code:: python
import re
import os
import datasets
from verl.utils.hdfs_io import copy, makedirs
import argparse
# To extract the solution for each prompts in the dataset
# def extract_solution(solution_str):
# ...
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--local_dir', default='/opt/tiger/gsm8k')
parser.add_argument('--hdfs_dir', default=None)
args = parser.parse_args()
num_few_shot = 5
data_source = 'openai/gsm8k'
dataset = datasets.load_dataset(data_source, 'main')
train_dataset = dataset['train']
test_dataset = dataset['test']
# Construct a `def make_map_fn(split)` for the corresponding datasets.
# ...
train_dataset = train_dataset.map(function=make_map_fn('train'), with_indices=True)
test_dataset = test_dataset.map(function=make_map_fn('test'), with_indices=True)
local_dir = args.local_dir
hdfs_dir = args.hdfs_dir
train_dataset.to_parquet(os.path.join(local_dir, 'train.parquet'))
test_dataset.to_parquet(os.path.join(local_dir, 'test.parquet'))
if hdfs_dir is not None:
    makedirs(hdfs_dir)
    copy(src=local_dir, dst=hdfs_dir)
2. Users are required to implement the ``make_map_fn()`` function
(as well as ``extract_solution``) on their own to support
different datasets or tasks.
We have already implemented the data preprocessing for the GSM8K, MATH, HellaSwag and Full_hh_rlhf
datasets. Here we take the GSM8K dataset as an example:
**GSM8K**
In the ``make_map_fn``, each data item should consist of the following
5 fields:
1. ``data_source``: The name of the dataset. To index the corresponding
reward function in the ``RewardModule``
2. ``prompt``: This field should be constructed in the format of
huggingface chat_template. The tokenizer in ``RLHFDataset`` will
apply chat template and tokenize the prompt.
3. ``ability``: Define the task category.
4. ``reward_model``: Currently, we only utilize the ``ground_truth``
field during evaluation. The ``ground_truth`` is computed by the
``extract_solution`` function. **Note** that the implementation of
the corresponding reward function should align with this extracted
``ground_truth``.
5. ``extra_info``: Record some information about the current prompt. Not
used for now.
.. code:: python
def extract_solution(solution_str):
solution = re.search("#### (\\-?[0-9\\.\\,]+)", solution_str) # extract the solution after ####
assert solution is not None
final_solution = solution.group(0)
final_solution = final_solution.split('#### ')[1].replace(',', '')
return final_solution
instruction_following = "Let's think step by step and output the final answer after \"####\"."
# add a row to each data item that represents a unique id
def make_map_fn(split):
def process_fn(example, idx):
question = example.pop('question')
question = question + ' ' + instruction_following
answer = example.pop('answer')
solution = extract_solution(answer)
data = {
"data_source": data_source,
"prompt": [{
"role": "user",
"content": question
}],
"ability": "math",
"reward_model": {
"style": "rule",
"ground_truth": solution
},
"extra_info": {
'split': split,
'index': idx
}
}
return data
return process_fn
Implement Reward Function for Dataset
======================================
For each dataset, we need to implement a reward function or utilize a reward model to compute the rewards for the generated responses.
We already pre-implemented some reward functions in `reward_score directory <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score>`_.
You can also use customized reward functions.
Currently, we support reward functions for the GSM8k and MATH datasets. For RLHF datasets (e.g.,
full_hh_rlhf) and code generation (e.g., APPS), we utilize a reward model
and SandBox (to be open-sourced soon) for evaluation, respectively.
RewardManager
-------------
In the entrypoint of the PPO Post-Training script `main_ppo.py <https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py#L33>`_,
we implement a ``RewardManager`` that utilizes pre-implemented reward functions to compute the scores for each response.
In the ``RewardManager``, we implemented a ``__call__`` function to
compute the score for each response.
All the reward functions are executed by ``compute_score_fn``.
The input is a ``DataProto``, which includes:
- ``input_ids``, ``attention_mask``: ``input_ids`` and ``attention_mask`` after applying
chat_template, including prompt and response
- ``responses``: response tokens
- ``ground_truth``: The ground truth string of the current prompt.
Stored in ``non_tensor_batch`` in the ``DataProto``, which should be
preprocessed in the parquet files.
- ``data_source``: The dataset name of the current prompt. Stored in
``non_tensor_batch`` in the ``DataProto``, which should be
preprocessed in the parquet files.
After detokenizing the responses, the response string and the ground
truth string are passed to ``compute_score_fn`` to compute the
score for each response.
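The control flow described above boils down to something like the following simplified, framework-free sketch. It uses plain Python stand-ins instead of ``DataProto``, and the names are illustrative; verl's actual ``RewardManager`` in ``main_ppo.py`` differs in detail.

.. code:: python

   # A simplified sketch of the RewardManager idea; `batch` stands in for the DataProto
   # described above (a list of per-sample dicts here, purely for illustration).
   def reward_manager_call(batch, tokenizer, compute_score_fn):
       scores = []
       for item in batch:
           # Detokenize the response tokens produced by the rollout.
           solution_str = tokenizer.decode(item["responses"], skip_special_tokens=True)
           # `ground_truth` and `data_source` come from the non-tensor fields,
           # preprocessed into the parquet files.
           score = compute_score_fn(
               data_source=item["data_source"],
               solution_str=solution_str,
               ground_truth=item["ground_truth"],
           )
           scores.append(score)
       return scores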
Reward Functions
----------------
Pre-implemented
~~~~~~~~~~~~~~~
We already pre-implemented some reward functions in `reward_score directory <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score>`_.
- In the `GSM8k example <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/gsm8k.py>`_, we
force the response to output the final answer after four hash signs (``####``), then
use string matching to compare it with the ground truth. If the answer is completely
correct, it scores 1 point; if only the format is correct, it scores 0.1 points; if
the format is incorrect, it scores 0 points. (A simplified sketch of this scoring rule follows this list.)
- In the `MATH example <https://github.com/volcengine/verl/blob/main/verl/utils/reward_score/math.py>`_, we follow
the implementation in `lm-evaluation-harness repository <https://github.com/EleutherAI/lm-evaluation-harness/blob/main/lm_eval/tasks/hendrycks_math/utils.py>`_.
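The GSM8k scoring rule described above can be summarized by the following simplified sketch; the actual ``gsm8k.py`` handles more edge cases:

.. code:: python

   import re

   def gsm8k_score_sketch(solution_str, ground_truth, format_score=0.1, score=1.0):
       # Expect the final answer after "####", as enforced by the prompt.
       match = re.search(r"#### (\-?[0-9\.\,]+)", solution_str)
       if match is None:
           return 0.0                                   # wrong format
       answer = match.group(1).replace(',', '')
       return score if answer == ground_truth else format_score  # 1.0 vs 0.1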
Customized
~~~~~~~~~~
You can implement customized reward functions in a separate file and specify them using ``custom_reward_function.path`` and ``custom_reward_function.name``. For how to set these options, please refer to :ref:`config-explain-page`.
The parameters of your reward function should be ``data_source``, ``solution_str``, ``ground_truth``, and ``extra_info``.
For example:
.. code:: python
def my_reward_fn(data_source, solution_str, ground_truth, extra_info=None):
return len(solution_str)/100
If you are testing only a single customized reward function, you can simply name it 'compute_score' and leave ``custom_reward_function.name`` unset.
To run multiple tests with different customized reward functions, you can modify both ``custom_reward_function.path`` and ``custom_reward_function.name`` for each trial.
For instance, you might create a single `my_reward.py` file and implement multiple reward functions within it. This way, for different trials, you only need to adjust ``custom_reward_function.name``, making it more convenient to conduct multiple tests within scripts.
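For instance, a hypothetical ``my_reward.py`` with two alternative reward functions might look like this (the function names are illustrative):

.. code:: python

   # my_reward.py -- hypothetical example with several candidate reward functions.
   def length_reward(data_source, solution_str, ground_truth, extra_info=None):
       # Trivial baseline: longer answers score higher.
       return len(solution_str) / 100

   def exact_match_reward(data_source, solution_str, ground_truth, extra_info=None):
       # 1.0 only if the ground truth string appears verbatim in the response.
       return 1.0 if ground_truth in solution_str else 0.0

For one trial you would point ``custom_reward_function.path`` at this file and set ``custom_reward_function.name=exact_match_reward``; for the next trial, only the ``name`` needs to change.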
# markdown support
recommonmark
myst_parser
# markdown table support
sphinx-markdown-tables
# theme default rtd
# crate-docs-theme
sphinx-rtd-theme
# pin tokenizers version to avoid env_logger version req
tokenizers==0.19.1
Multi-turn Rollout Support
==========================
Basic Configuration
~~~~~~~~~~~~~~~~~~~
To enable multi-turn rollout, make sure to configure the following fields in your rollout configuration:
.. code-block:: yaml
actor_rollout_ref:
rollout:
multi_turn: True
name: "sglang"
This configuration activates the SGLang engine for multi-turn interaction during rollout.
Custom Tool Configuration
~~~~~~~~~~~~~~~~~~~~~~~~~
For custom environment interaction tools, you can implement your own tools based on ``verl.tools.base_tool.BaseTool``. Then, specify your tool configurations in a YAML file:
.. code-block:: yaml
tools:
- class_name: ""
config: {}
tool_schema:
You may refer to GSM8KTool_example_configuration_, which is one example of the tool configurations. Its implementation can be found in gsm8k_tool.py_.
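As a rough sketch of what such a tool can look like: the method names below follow gsm8k_tool.py_, but the exact base-class signatures vary between verl versions, so treat this as a shape rather than a drop-in implementation.

.. code-block:: python

   # Hypothetical sketch only -- check BaseTool in your verl version for the exact interface.
   from verl.tools.base_tool import BaseTool

   class MySearchTool(BaseTool):
       async def create(self, instance_id=None, **kwargs):
           # Initialize per-rollout state (e.g., remember the ground truth) and
           # return an identifier for this tool instance.
           return instance_id

       async def execute(self, instance_id, parameters, **kwargs):
           # Run the actual tool call and return (tool_response, tool_reward, metrics).
           query = parameters.get("query", "") if isinstance(parameters, dict) else str(parameters)
           tool_response = f"search results for: {query}"   # call your real backend here
           return tool_response, 0.0, {}

       async def calc_reward(self, instance_id, **kwargs):
           # Optional: score the final state of this tool instance.
           return 0.0

       async def release(self, instance_id, **kwargs):
           # Clean up any per-instance state.
           pass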
Finally, set the ``tools_config_file`` in your rollout config:
.. code-block:: yaml
actor_rollout_ref:
rollout:
tool_kwargs:
tools_config_file: <path_to_tool_yaml_file>
This allows integration of customized tool behaviors during actor rollout steps.
Multi-turn Tokenization
~~~~~~~~~~~~~~~~~~~~~~~
Tokenizing multi-turn rollouts poses a challenge: after applying the chat template and tokenizing the full message list, it’s hard to identify which tokens belong to assistant messages. Since the token list is flat, it lacks direct alignment with the message roles.
To address this, we adopt a **delta-based tokenization** strategy. Each time the LLM generates a new message, we:
1. Apply the chat template to all prior messages (`messages[:i]`).
2. Apply the chat template again including the latest message (`messages[:i+1]`).
3. Tokenize only the *delta* between these two serialized message strings.
This ensures that only tokens generated by the assistant are included in the loss mask.
.. code-block:: python
# Exclude the assistant prompt (e.g., "<|im_start|>assistant") from the loss by setting add_generation_prompt=True
prev = tokenizer.apply_chat_template(messages[:i], add_generation_prompt=True, tokenize=False)
curr = tokenizer.apply_chat_template(messages[:i+1], add_generation_prompt=False, tokenize=False)
new_token_ids = tokenizer.encode(curr[len(prev):], add_special_tokens=False)
token_ids += new_token_ids
loss_mask += [1] * len(new_token_ids)  # Mask only the newly generated assistant tokens
While we've validated that this produces results consistent with full-message tokenization, future models' chat templates could break compatibility. To guard against silent inconsistencies, we compare the delta-based tokenization with the full-tokenization results by default at the end of each rollout.
If you see the following warning, enable `INFO` log level to inspect the mismatched outputs:
.. code-block::
Inconsistent training and inference tokenization detected. This may lead to unexpected behavior during training. Please review your chat template to determine if this is intentional. For more information, refer to the multiturn README.md.
If the discrepancy is expected, you can disable the sanity check via:
``actor_rollout_ref.rollout.multi_turn.enable_tokenization_sanity_check=False``
Special Cases
^^^^^^^^^^^^^
Some models (e.g., Qwen/QwQ-32B and Qwen3 series) remove internal reasoning content during chat template rendering. As a result, the message content can vary across turns, making the delta-based tokenization inaccurate.
For example, for the following conversation:
.. code-block:: python
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "What is 2 + 2?"},
{"role": "assistant", "content": "<think>user asked about a simple math question.</think> 2 + 2 = 4."},
{"role": "user", "content": "Explain why."},
{"role": "assistant", "content": "<think>user wants to know the reasoning behind the answer. Search for a good explanation</think>",
"tool_calls": [{"id": "tool1", "type": "search", "arguments": {"query": "Why is 2 + 2 = 4?"}}]},
{"role": "tool", "content": "The sum of two and two is four because it is a basic arithmetic operation."},
{"role": "assistant", "content": "<think>The tool provided a good explanation.</think>The sum of two and two is four because it is a basic arithmetic operation."}
]
1. Qwen/QwQ-32B will remove all reasoning content except the last assistant message after applying the chat template.
.. code-block:: text
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What is 2 + 2?<|im_end|>
<|im_start|>assistant
2 + 2 = 4.<|im_end|>
<|im_start|>user
Explain why.<|im_end|>
<|im_start|>assistant
<tool_call>
{"name": "", "arguments": {"query": "Why is 2 + 2 = 4?"}}
</tool_call><|im_end|>
<|im_start|>user
<tool_response>
The sum of two and two is four because it is a basic arithmetic operation.
</tool_response><|im_end|>
<|im_start|>assistant
<think>The tool provided a good explanation.</think> The sum of two and two is four because it is a basic arithmetic operation.<|im_end|>
2. Qwen3 series will remove all reasoning content before the last user message.
.. code-block:: text
<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What is 2 + 2?<|im_end|>
<|im_start|>assistant
2 + 2 = 4.<|im_end|>
<|im_start|>user
Explain why.<|im_end|>
<|im_start|>assistant
<think>
user wants to know the reasoning behind the answer. Search for a good explanation
</think>
<tool_call>
{"name": "", "arguments": {"query": "Why is 2 + 2 = 4?"}}
</tool_call><|im_end|>
<|im_start|>user
<tool_response>
The sum of two and two is four because it is a basic arithmetic operation.
</tool_response><|im_end|>
<|im_start|>assistant
<think>
The tool provided a good explanation.
</think>
The sum of two and two is four because it is a basic arithmetic operation.<|im_end|>
To handle this, we fall back to a **fixed base conversation** containing only a single system and user message. Since this base doesn’t include assistant messages or reasoning content, it remains consistent across turns.
.. code-block:: python
BASE_CHAT_HISTORY = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "I am a user."}
]
prev = tokenizer.apply_chat_template(BASE_CHAT_HISTORY, add_generation_prompt=True, tokenize=False)
curr = tokenizer.apply_chat_template([*BASE_CHAT_HISTORY, messages[i]], add_generation_prompt=False, tokenize=False)
new_token_ids = tokenizer.encode(curr[len(prev):], add_special_tokens=False)
token_ids += new_token_ids
loss_mask += [1] * len(new_token_ids)
This method works well for Qwen3 series. However, Qwen/QwQ-32B currently has a bug in its chat template. A fix_ has been proposed but not yet adopted. Until then, use the following command to download the fixed model revision:
.. code-block:: bash
pip install huggingface_hub
huggingface-cli download Qwen/QwQ-32B --revision refs/pr/81
.. _fix: https://huggingface.co/Qwen/QwQ-32B/discussions/81
Discrepancy Between Training and Inference Templates
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Although the above approach fixes the delta mismatch issue, the removal of reasoning content in the inference-time chat template introduces a new discrepancy: training uses the full reasoning content, while inference does not.
This mismatch can affect model performance in unpredictable ways. To avoid it, we default to using the full response (including reasoning) for both training and rollout.
However, this approach comes with trade-offs:
1. Long reasoning contents can easily exceed the model’s context window, especially in multi-turn rollout.
2. There's a mismatch between rollout and the production environment: models will not have reasoning content from past turns if you use the default chat template in production.
We are still evaluating the impact of these issues. If you experience context length problems or prefer rollouts that match production (i.e., exclude reasoning), you can enable:
``actor_rollout_ref.rollout.multi_turn.use_inference_chat_template = True``
GSM8K Multi-turn Training Performance
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
See the training performance of multi-turn rollout on the GSM8K task HERE_.
.. _HERE: https://wandb.ai/zhaochenyang20/gsm8k_async_rl/runs/1ro1r7om?nw=nwuserzhaochenyang20
.. _GSM8KTool_example_configuration: https://github.com/volcengine/verl/blob/main/examples/sglang_multiturn/config/tool_config/gsm8k_tool_config.yaml
.. _gsm8k_tool.py: https://github.com/volcengine/verl/blob/main/verl/tools/gsm8k_tool.py
Search Tool Integration
~~~~~~~~~~~~~~~~~~~~~~~
.. toctree::
:maxdepth: 1
search_tool_example
\ No newline at end of file