Commit 11cc0595 by Yaoyu Zhu

update

parent 38f6c844
...@@ -12,6 +12,7 @@
 **/ret_one
 **/tmp
 **/results
+/data/
 *.slurm*
 *.nfs*
 /*.sh*
...
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import argparse


def process_folder(folder):
    """Process a single run folder; return step/timing data and response-length data."""
    csv_path = Path(folder) / 'stats.csv'
    df = pd.read_csv(csv_path)
    # Select the columns whose names start with 'timing_s/step'
    timing_cols = [col for col in df.columns if col.startswith('timing_s/step')]
    # Use 'response_length/mean' if the column is present
    if 'response_length/mean' in df.columns:
        response_length_col = ['step', 'response_length/mean']
    else:
        response_length_col = []
    # Keep the step column together with the timing columns
    df_timing = df[['step'] + timing_cols]
    # Convert to long format (one step/time pair per row)
    df_timing = pd.melt(df_timing, id_vars='step', var_name='timing_s/step', value_name='time')
    # Keep only steps <= 55
    df_timing = df_timing[df_timing['step'] <= 55]
    if response_length_col:
        df_response = df[response_length_col]
        df_response = df_response[df_response['step'] <= 55]
        return df_timing, df_response
    return df_timing, None


def plot_timing_comparison(folders, labels):
    """Plot per-step timing and response length for the given runs on a twin-axis line chart."""
    fig, ax1 = plt.subplots(figsize=(12, 6))
    ax2 = ax1.twinx()
    for folder, label in zip(folders, labels):
        df_timing, df_response = process_folder(folder)
        # Average over samples per step (a no-op when there is a single sample per step;
        # adjust if the data layout differs)
        df_step_avg = df_timing.groupby('step')['time'].mean().reset_index()
        ax1.plot(df_step_avg['step'], df_step_avg['time'], marker=None, linestyle='-',
                 linewidth=2, label=f'{label} Timing')
        if df_response is not None:
            ax2.plot(df_response['step'], df_response['response_length/mean'], marker=None, linestyle='--',
                     linewidth=2, label=f'{label} Response Length')
    ax1.set_xlabel('Step')
    ax1.set_ylabel('Time (seconds)', color='b')
    ax2.set_ylabel('Response Length (mean)', color='r')
    plt.title('Timing and Response Length Comparison Between Accelerated and Non-Accelerated Versions')
    # Merge the legends from both axes
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc='upper left')
    ax1.grid(True, linestyle='--', alpha=0.7)
    plt.tight_layout()
    # Save and then show the figure
    save_dir = 'results/figures'
    import os
    os.makedirs(save_dir, exist_ok=True)
    plt.savefig(os.path.join(save_dir, 'timing_comparison.png'))
    plt.show()


if __name__ == '__main__':
    folders = [
        'results/codev_3.1k_dapo_accelerate',
        'results/codev_3.1k_dapo_no_accelerate'
    ]
    labels = ['Accelerated', 'Non-Accelerated']  # Label for each folder
    plot_timing_comparison(folders, labels)
\ No newline at end of file
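For reference, a minimal hypothetical sketch of the stats.csv layout the plotting script above expects; the folder and module names are illustrative assumptions, while the column names ('step', 'timing_s/step', 'response_length/mean') come from the script itself.

# Hypothetical smoke test for process_folder; folder and module names are assumptions.
from pathlib import Path
import pandas as pd
from plot_timing import process_folder  # assumes the script above is saved as plot_timing.py

folder = Path('results/example_run')  # illustrative run folder
folder.mkdir(parents=True, exist_ok=True)
pd.DataFrame({
    'step': [1, 2, 3],
    'timing_s/step': [132.4, 128.9, 130.1],         # seconds per training step
    'response_length/mean': [812.0, 905.5, 987.2],  # mean generated tokens per step
}).to_csv(folder / 'stats.csv', index=False)

df_timing, df_response = process_folder(folder)
print(df_timing)    # long-format step/time pairs
print(df_response)  # step vs. mean response length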
#!/bin/bash
set -x
set -euxo pipefail
project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'
adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 1))
overlong_penalty_factor=1.0
# An early version for DAPO
enable_filter_groups=True
train_prompt_bsz=128
train_prompt_mini_bsz=64
n_resp_per_prompt=16
use_token_level_loss=True
# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}
# Paths
# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 16))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4
ppo_max_token_len_per_gpu=32768
num_gpu=$(($USER_GPUS_PER_NODE * $SLURM_JOB_NUM_NODES))
export VLLM_USE_V1=1
echo "$WANDB_DIR"
echo "$SAVE_DIR"
echo "$WANDB_API_KEY"
# Set default model path if not provided
MODEL_PATH="/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"
# Train over a single node, 8 A100-80GB GPUs.
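# Note on data.gen_batch_size below: the arithmetic rounds the generation batch up to
# a multiple of the total GPU count so the batch divides evenly across GPUs.
# Illustrative numbers only: with train_prompt_bsz=128 and num_gpu=24 (e.g. 3 nodes x 8 GPUs),
# bash integer division gives (128 + 24 - 1) / 24 * 24 = 144.
# Likewise, log_prob_max_token_len_per_gpu=$((ppo_max_token_len_per_gpu*2)) evaluates to 65536.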
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/qwen7b32b_filter_gt_r1_14k/train.parquet \
data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/qwen7b32b_filter_gt_r1_14k/test.parquet \
data.train_batch_size=${train_prompt_bsz} \
data.val_batch_size=512 \
data.max_prompt_length=2048 \
data.max_response_length=$max_response_length \
algorithm.filter_groups.enable=${enable_filter_groups} \
algorithm.filter_groups.max_num_gen_batches=999 \
algorithm.filter_groups.metric=acc \
data.gen_batch_size=$((($train_prompt_bsz + $num_gpu - 1) / $num_gpu * $num_gpu)) \
actor_rollout_ref.model.path=$MODEL_PATH \
+actor_rollout_ref.model.override_config.attention_dropout=0. \
+actor_rollout_ref.model.override_config.embd_pdrop=0. \
+actor_rollout_ref.model.override_config.resid_pdrop=0. \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
+actor_rollout_ref.model.use_liger=True \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.optim.weight_decay=0.0 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ppo_max_token_len_per_gpu} \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.actor.kl_loss_coef=0.00 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.grad_clip=0.5 \
actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss} \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=$(($ppo_max_token_len_per_gpu*2)) \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.rollout.val_kwargs.n=4 \
actor_rollout_ref.rollout.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
reward_model.reward_manager=prime \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer} \
custom_reward_function.overlong_buffer.len=${overlong_buffer_len} \
custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
custom_reward_function.train.path=verl/utils/reward_score/codev.py \
custom_reward_function.train.name=compute_score_wrapper \
custom_reward_function.train.continuous_reward.enable=False \
algorithm.kl_ctrl.kl_coef=0.0 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
trainer.project_name='codev' \
trainer.experiment_name='codev-7b-3.1kdata' \
trainer.n_gpus_per_node=$USER_GPUS_PER_NODE \
trainer.nnodes=$SLURM_JOB_NUM_NODES \
+trainer.val_before_train=False \
trainer.default_local_dir=$SAVE_DIR \
trainer.resume_mode=auto \
trainer.default_hdfs_dir=null \
trainer.save_freq=20 \
trainer.test_freq=20 \
trainer.total_epochs=100 "${@:1}"
# custom_reward_function.path=/nfs_global/S/zhuyaoyu/projects/dapo/verl/utils/reward_score/codev.py \
\ No newline at end of file
...@@ -14,14 +14,13 @@ clip_ratio_low=0.2
 clip_ratio_high=0.28
 enable_overlong_buffer=True
-overlong_buffer_len=$((1024 * 4))
+overlong_buffer_len=$((1024 * 1))
 overlong_penalty_factor=1.0
 # An early version for DAPO
 enable_filter_groups=True
-gen_prompt_bsz=512 # NOTE: no filtering here
-train_prompt_bsz=512
-train_prompt_mini_bsz=32
+train_prompt_bsz=128
+train_prompt_mini_bsz=64
 n_resp_per_prompt=16
 use_token_level_loss=True
...@@ -34,7 +33,7 @@ NNODES=${NNODES:-16}
 # Algorithm
 ## Train
 max_prompt_length=$((1024 * 2))
-max_response_length=$((1024 * 20))
+max_response_length=$((1024 * 16))
 ## Validation
 val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
...@@ -46,8 +45,8 @@ actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
 infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
 offload=True
 gen_tp=4
+ppo_max_token_len_per_gpu=32768
+num_gpu=$(($USER_GPUS_PER_NODE * $SLURM_JOB_NUM_NODES))
 export VLLM_USE_V1=1
...@@ -64,27 +63,29 @@ MODEL_PATH="/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-
 python3 -m verl.trainer.main_ppo \
 algorithm.adv_estimator=grpo \
 data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/16k_r1_filtered/train.parquet \
-data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1_1/10k_qwq/test.parquet \
+data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/16k_r1_filtered/test.parquet \
-data.train_batch_size=128 \
+data.train_batch_size=${train_prompt_bsz} \
 data.val_batch_size=512 \
 data.max_prompt_length=2048 \
-data.max_response_length=16384 \
+data.max_response_length=$max_response_length \
 algorithm.filter_groups.enable=${enable_filter_groups} \
 algorithm.filter_groups.max_num_gen_batches=999 \
 algorithm.filter_groups.metric=acc \
+data.gen_batch_size=$((($train_prompt_bsz + $num_gpu - 1) / $num_gpu * $num_gpu)) \
 actor_rollout_ref.model.path=$MODEL_PATH \
 +actor_rollout_ref.model.override_config.attention_dropout=0. \
 +actor_rollout_ref.model.override_config.embd_pdrop=0. \
 +actor_rollout_ref.model.override_config.resid_pdrop=0. \
 actor_rollout_ref.model.enable_gradient_checkpointing=True \
++actor_rollout_ref.model.use_liger=True \
 actor_rollout_ref.actor.optim.lr=1e-6 \
 actor_rollout_ref.actor.optim.weight_decay=0.0 \
 actor_rollout_ref.actor.use_dynamic_bsz=True\
-actor_rollout_ref.actor.ppo_max_token_len_per_gpu=32768 \
+actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ppo_max_token_len_per_gpu} \
 actor_rollout_ref.model.use_remove_padding=True \
 actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
 actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
-actor_rollout_ref.actor.ppo_mini_batch_size=64 \
+actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
 actor_rollout_ref.actor.use_kl_loss=False \
 actor_rollout_ref.actor.kl_loss_coef=0.00 \
 actor_rollout_ref.actor.kl_loss_type=low_var_kl \
...@@ -94,15 +95,15 @@ python3 -m verl.trainer.main_ppo \
 actor_rollout_ref.model.enable_gradient_checkpointing=True \
 actor_rollout_ref.actor.fsdp_config.param_offload=False \
 actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
-actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=32768 \
+actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=$(($ppo_max_token_len_per_gpu*2)) \
 actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
 actor_rollout_ref.rollout.name=vllm \
-actor_rollout_ref.rollout.n=16 \
+actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
-actor_rollout_ref.rollout.val_kwargs.n=1 \
+actor_rollout_ref.rollout.val_kwargs.n=4 \
 actor_rollout_ref.rollout.temperature=1.0 \
 actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
 actor_rollout_ref.rollout.val_kwargs.do_sample=True \
-actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
+actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
 actor_rollout_ref.rollout.enforce_eager=False \
 actor_rollout_ref.rollout.free_cache_engine=False \
 reward_model.reward_manager=prime \
...@@ -110,17 +111,23 @@ python3 -m verl.trainer.main_ppo \
 custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer} \
 custom_reward_function.overlong_buffer.len=${overlong_buffer_len} \
 custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
+custom_reward_function.train.path=verl/utils/reward_score/codev.py \
+custom_reward_function.train.name=compute_score_wrapper \
+custom_reward_function.train.continuous_reward.enable=False \
 algorithm.kl_ctrl.kl_coef=0.0 \
 trainer.critic_warmup=0 \
 trainer.logger=['console','wandb'] \
 trainer.project_name='codev' \
-trainer.experiment_name='codev-7b-16k' \
+trainer.experiment_name='codev-7b-3.1kdata' \
 trainer.n_gpus_per_node=$USER_GPUS_PER_NODE \
 trainer.nnodes=$SLURM_JOB_NUM_NODES \
 +trainer.val_before_train=False \
 trainer.default_local_dir=$SAVE_DIR \
 trainer.resume_mode=auto \
 trainer.default_hdfs_dir=null \
-trainer.save_freq=15 \
+trainer.save_freq=20 \
-trainer.test_freq=200 \
+trainer.test_freq=20 \
 trainer.total_epochs=100 "${@:1}"
\ No newline at end of file
+# custom_reward_function.path=/nfs_global/S/zhuyaoyu/projects/dapo/verl/utils/reward_score/codev.py \
\ No newline at end of file
#!/bin/bash
set -x
set -euxo pipefail
project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'
adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 1))
overlong_penalty_factor=1.0
# An early version for DAPO
enable_filter_groups=True
train_prompt_bsz=128
train_prompt_mini_bsz=64
n_resp_per_prompt=16
use_token_level_loss=True
# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}
# Paths
# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 16))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4
ppo_max_token_len_per_gpu=32768
num_gpu=$(($USER_GPUS_PER_NODE * $SLURM_JOB_NUM_NODES))
export VLLM_USE_V1=1
echo "$WANDB_DIR"
echo "$SAVE_DIR"
echo "$WANDB_API_KEY"
# Set default model path if not provided
MODEL_PATH="/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"
# Train over a single node, 8 A100-80GB GPUs.
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/3.1k_r1_filtered/train.parquet \
data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/3.1k_r1_filtered/test.parquet \
data.train_batch_size=${train_prompt_bsz} \
data.val_batch_size=512 \
data.max_prompt_length=2048 \
data.max_response_length=$max_response_length \
algorithm.filter_groups.enable=${enable_filter_groups} \
algorithm.filter_groups.max_num_gen_batches=999 \
algorithm.filter_groups.metric=acc \
data.gen_batch_size=$((($train_prompt_bsz + $num_gpu - 1) / $num_gpu * $num_gpu)) \
actor_rollout_ref.model.path=$MODEL_PATH \
+actor_rollout_ref.model.override_config.attention_dropout=0. \
+actor_rollout_ref.model.override_config.embd_pdrop=0. \
+actor_rollout_ref.model.override_config.resid_pdrop=0. \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
+actor_rollout_ref.model.use_liger=True \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.optim.weight_decay=0.0 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ppo_max_token_len_per_gpu} \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.actor.kl_loss_coef=0.00 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.grad_clip=0.5 \
actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss} \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=$(($ppo_max_token_len_per_gpu*2)) \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.rollout.val_kwargs.n=4 \
actor_rollout_ref.rollout.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
reward_model.reward_manager=prime \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer} \
custom_reward_function.overlong_buffer.len=${overlong_buffer_len} \
custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
custom_reward_function.train.path=verl/utils/reward_score/codev.py \
custom_reward_function.train.name=compute_score_wrapper \
custom_reward_function.train.continuous_reward.enable=False \
algorithm.kl_ctrl.kl_coef=0.0 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
trainer.project_name='codev' \
trainer.experiment_name='codev-7b-3.1kdata' \
trainer.n_gpus_per_node=$USER_GPUS_PER_NODE \
trainer.nnodes=$SLURM_JOB_NUM_NODES \
+trainer.val_before_train=False \
trainer.default_local_dir=$SAVE_DIR \
trainer.resume_mode=auto \
trainer.default_hdfs_dir=null \
trainer.save_freq=20 \
trainer.test_freq=20 \
trainer.total_epochs=100 "${@:1}"
# custom_reward_function.path=/nfs_global/S/zhuyaoyu/projects/dapo/verl/utils/reward_score/codev.py \
\ No newline at end of file
#!/bin/bash
set -x
set -euxo pipefail
project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'
adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 1))
overlong_penalty_factor=1.0
# An early version for DAPO
enable_filter_groups=True
train_prompt_bsz=128
train_prompt_mini_bsz=64
n_resp_per_prompt=16
use_token_level_loss=True
# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}
# Paths
# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 16))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4
ppo_max_token_len_per_gpu=32768
num_gpu=$(($USER_GPUS_PER_NODE * $SLURM_JOB_NUM_NODES))
export VLLM_USE_V1=1
echo "$WANDB_DIR"
echo "$SAVE_DIR"
echo "$WANDB_API_KEY"
# Set default model path if not provided
MODEL_PATH="/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"
# Train over a single node, 8 A100-80GB GPUs.
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/3.1k_r1_filtered/train.parquet \
data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/3.1k_r1_filtered/test.parquet \
data.train_batch_size=${train_prompt_bsz} \
data.val_batch_size=512 \
data.max_prompt_length=2048 \
data.max_response_length=$max_response_length \
algorithm.filter_groups.enable=${enable_filter_groups} \
algorithm.filter_groups.max_num_gen_batches=999 \
algorithm.filter_groups.metric=acc \
data.gen_batch_size=$((($train_prompt_bsz + $num_gpu - 1) / $num_gpu * $num_gpu)) \
actor_rollout_ref.model.path=$MODEL_PATH \
+actor_rollout_ref.model.override_config.attention_dropout=0. \
+actor_rollout_ref.model.override_config.embd_pdrop=0. \
+actor_rollout_ref.model.override_config.resid_pdrop=0. \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
+actor_rollout_ref.model.use_liger=True \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.optim.weight_decay=0.0 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ppo_max_token_len_per_gpu} \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.actor.kl_loss_coef=0.00 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.grad_clip=0.5 \
actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss} \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=$(($ppo_max_token_len_per_gpu*2)) \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.rollout.val_kwargs.n=4 \
actor_rollout_ref.rollout.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
reward_model.reward_manager=prime \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer} \
custom_reward_function.overlong_buffer.len=${overlong_buffer_len} \
custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
custom_reward_function.train.path=verl/utils/reward_score/codev.py \
custom_reward_function.train.name=compute_score_wrapper \
custom_reward_function.train.continuous_reward.enable=False \
algorithm.kl_ctrl.kl_coef=0.0 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
trainer.project_name='codev' \
trainer.experiment_name='codev-7b-3.1kdata' \
trainer.n_gpus_per_node=$USER_GPUS_PER_NODE \
trainer.nnodes=$SLURM_JOB_NUM_NODES \
+trainer.val_before_train=False \
trainer.default_local_dir=$SAVE_DIR \
trainer.resume_mode=auto \
trainer.default_hdfs_dir=null \
trainer.save_freq=40 \
trainer.test_freq=20 \
trainer.total_epochs=100 "${@:1}"
# custom_reward_function.path=/nfs_global/S/zhuyaoyu/projects/dapo/verl/utils/reward_score/codev.py \
\ No newline at end of file
#!/bin/bash
set -x
set -euxo pipefail
project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'
adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 1))
overlong_penalty_factor=1.0
# An early version for DAPO
enable_filter_groups=True
train_prompt_bsz=128
train_prompt_mini_bsz=64
n_resp_per_prompt=16
use_token_level_loss=True
# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}
# Paths
# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 16))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4
ppo_max_token_len_per_gpu=32768
num_gpu=$(($USER_GPUS_PER_NODE * $SLURM_JOB_NUM_NODES))
export VLLM_USE_V1=1
echo "$WANDB_DIR"
echo "$SAVE_DIR"
echo "$WANDB_API_KEY"
# Set default model path if not provided
MODEL_PATH="/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"
# Train over a single node, 8 A100-80GB GPUs.
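# Note on data.gen_batch_size below: this variant generates roughly 1.5x the training
# batch of prompts per step (headroom for group filtering) and rounds up to a multiple
# of the total GPU count. Illustrative numbers only: with train_prompt_bsz=128 and
# num_gpu=24, bash integer division gives (128 * 3 / 2 + 24 - 1) / 24 * 24 = 192.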
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/3.1k_r1_filtered/train.parquet \
data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/3.1k_r1_filtered/test.parquet \
data.train_batch_size=${train_prompt_bsz} \
data.val_batch_size=512 \
data.max_prompt_length=2048 \
data.max_response_length=$max_response_length \
algorithm.filter_groups.enable=${enable_filter_groups} \
algorithm.filter_groups.accelerate=False \
algorithm.filter_groups.max_num_gen_batches=999 \
algorithm.filter_groups.metric=acc \
data.gen_batch_size=$((($train_prompt_bsz * 3 / 2 + $num_gpu - 1) / $num_gpu * $num_gpu)) \
actor_rollout_ref.model.path=$MODEL_PATH \
+actor_rollout_ref.model.override_config.attention_dropout=0. \
+actor_rollout_ref.model.override_config.embd_pdrop=0. \
+actor_rollout_ref.model.override_config.resid_pdrop=0. \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
+actor_rollout_ref.model.use_liger=True \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.optim.weight_decay=0.0 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ppo_max_token_len_per_gpu} \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.use_kl_loss=False \
actor_rollout_ref.actor.kl_loss_coef=0.00 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.grad_clip=0.5 \
actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss} \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=$(($ppo_max_token_len_per_gpu*2)) \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.rollout.val_kwargs.n=4 \
actor_rollout_ref.rollout.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
reward_model.reward_manager=prime \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer} \
custom_reward_function.overlong_buffer.len=${overlong_buffer_len} \
custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
custom_reward_function.train.path=verl/utils/reward_score/codev.py \
custom_reward_function.train.name=compute_score_wrapper \
custom_reward_function.train.continuous_reward.enable=False \
algorithm.kl_ctrl.kl_coef=0.0 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
trainer.project_name='codev' \
trainer.experiment_name='codev-7b-3.1kdata' \
trainer.n_gpus_per_node=$USER_GPUS_PER_NODE \
trainer.nnodes=$SLURM_JOB_NUM_NODES \
+trainer.val_before_train=False \
trainer.default_local_dir=$SAVE_DIR \
trainer.resume_mode=auto \
trainer.default_hdfs_dir=null \
trainer.save_freq=40 \
trainer.test_freq=20 \
trainer.total_epochs=100 "${@:1}"
# custom_reward_function.path=/nfs_global/S/zhuyaoyu/projects/dapo/verl/utils/reward_score/codev.py \
\ No newline at end of file
...@@ -129,7 +129,7 @@ class DAPORewardManager:
         if return_dict:
             return {
                 "reward_tensor": reward_tensor,
-                "reward_extra_info": reward_extra_info,
+                "reward_extra_info": reward_extra_info,gb
             }
         else:
             return reward_tensor