Commit 014a39f4 by Yaoyu Zhu

update codev dataset and rl config (use_liger)

parent 31ee9176
@@ -91,7 +91,7 @@ if __name__ == '__main__':
     parser.add_argument('--train_size', type=int, default=15000)
     parser.add_argument('--test_size', type=int, default=984)
     parser.add_argument('--save_jsonl', action='store_true', help='Save dataset as jsonl files')
-    parser.add_argument('--double_gt', action='store_true', help='View r1 code as well as original ground truth as ground truth')
+    parser.add_argument('--gt', type=str, default='gt', choices=['gt', 'r1', 'double'], help='Use the original ground truth, the r1 response, or both as the ground truth')
     # continuous_reward is moved to training cfg
     # parser.add_argument('--continuous_reward', action='store_true', help='Save dataset as jsonl files')
     # parser.add_argument('--template_type', type=str, default='base')
@@ -144,8 +144,10 @@ if __name__ == '__main__':
         question = make_question(example["question"])
         # if args.continuous_reward:
         #     ground_truth = {"answer": ground_truth, "reward_mode": "continuous"}
-        if args.double_gt:
+        if args.gt == 'double':
             ground_truth = {"answer": ground_truth, "r1_answer": extract_verilog(example["r1_response"]["content"])}
+        elif args.gt == 'r1':
+            ground_truth = extract_verilog(example["r1_response"]["content"])
         # pprint(ground_truth)
         # exit(0)
...
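For readability, a minimal standalone sketch of what the new `--gt` choices are meant to do to the `ground_truth` field (the function name and pre-extracted arguments are illustrative, not the committed code):

# Hypothetical helper mirroring the --gt branching in the preprocessing script above.
def build_ground_truth(original: str, r1_code: str, gt_mode: str):
    """Map the --gt choices onto the ground_truth field.

    'gt'     -> keep only the original reference answer
    'r1'     -> keep only the Verilog extracted from the r1 response
    'double' -> keep both, so the reward function can accept either
    """
    if gt_mode == "double":
        return {"answer": original, "r1_answer": r1_code}
    if gt_mode == "r1":
        return r1_code
    return original

# Example: build_ground_truth("module ref ...", "module r1 ...", "double")
# -> {"answer": "module ref ...", "r1_answer": "module r1 ..."}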
@@ -19,9 +19,8 @@ overlong_penalty_factor=1.0
 # An early version for DAPO
 enable_filter_groups=True
-gen_prompt_bsz=512 # NOTE: no filtering here
-train_prompt_bsz=512
-train_prompt_mini_bsz=32
+train_prompt_bsz=128
+train_prompt_mini_bsz=64
 n_resp_per_prompt=16
 use_token_level_loss=True
@@ -46,7 +45,8 @@ actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
 infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
 offload=True
 gen_tp=4
+ppo_max_token_len_per_gpu=32768
+num_gpu=$(($USER_GPUS_PER_NODE * $SLURM_JOB_NUM_NODES))
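A small Python sketch of the arithmetic implied by these values (using the prompt/response lengths passed on the command line below; a sanity check, not part of the script):

# Sanity-check sketch of the batch/token settings above; values copied from the script.
max_prompt_length = 2048
max_response_length = 16384                  # data.max_response_length in the launch command
ppo_max_token_len_per_gpu = 32768
train_prompt_bsz, train_prompt_mini_bsz, n_resp_per_prompt = 128, 64, 16

# The longest packed sequence must fit the per-GPU token budget.
assert max_prompt_length + max_response_length <= ppo_max_token_len_per_gpu   # 18432 <= 32768

print(train_prompt_bsz * n_resp_per_prompt)       # 2048 responses generated per training step
print(train_prompt_bsz // train_prompt_mini_bsz)  # 2 PPO mini-batches of prompts per step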
@@ -65,13 +65,14 @@ python3 -m verl.trainer.main_ppo \
 algorithm.adv_estimator=grpo \
 data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/err_l0.2_16k_r1_filtered/train.parquet \
 data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/err_l0.2_16k_r1_filtered/test.parquet \
-data.train_batch_size=128 \
+data.train_batch_size=${train_prompt_bsz} \
 data.val_batch_size=512 \
 data.max_prompt_length=2048 \
 data.max_response_length=16384 \
 algorithm.filter_groups.enable=${enable_filter_groups} \
 algorithm.filter_groups.max_num_gen_batches=999 \
 algorithm.filter_groups.metric=acc \
+data.gen_batch_size=$((($train_prompt_bsz * 4 / 3 + $num_gpu - 1) / $num_gpu * $num_gpu)) \
 actor_rollout_ref.model.path=$MODEL_PATH \
 +actor_rollout_ref.model.override_config.attention_dropout=0. \
 +actor_rollout_ref.model.override_config.embd_pdrop=0. \
@@ -97,7 +98,7 @@ python3 -m verl.trainer.main_ppo \
 actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=32768 \
 actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
 actor_rollout_ref.rollout.name=vllm \
-actor_rollout_ref.rollout.n=16 \
+actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
 actor_rollout_ref.rollout.val_kwargs.n=2 \
 actor_rollout_ref.rollout.temperature=1.0 \
 actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
@@ -110,11 +111,11 @@ python3 -m verl.trainer.main_ppo \
 custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer} \
 custom_reward_function.overlong_buffer.len=${overlong_buffer_len} \
 custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
-custom_reward_function.path=verl/utils/reward_score/codev.py \
-custom_reward_function.name=compute_score_wrapper \
-custom_reward_function.continuous_reward.enable=True \
-custom_reward_function.continuous_reward.err_threshold=0.2 \
-custom_reward_function.continuous_reward.reward_mapping='threshold' \
+custom_reward_function.train.path=verl/utils/reward_score/codev.py \
+custom_reward_function.train.name=compute_score_wrapper \
+custom_reward_function.train.continuous_reward.enable=True \
+custom_reward_function.train.continuous_reward.err_threshold=0.2 \
+custom_reward_function.train.continuous_reward.reward_mapping='threshold' \
 algorithm.kl_ctrl.kl_coef=0.0 \
 trainer.critic_warmup=0 \
 trainer.logger=['console','wandb'] \
...
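The new `data.gen_batch_size` expression oversamples prompts for group filtering and rounds the result up to a multiple of the GPU count. A Python sketch of the same integer arithmetic (the GPU count of 16 is only an example):

def gen_batch_size(train_prompt_bsz: int, num_gpu: int) -> int:
    """Mirror the bash expression
    $((($train_prompt_bsz * 4 / 3 + $num_gpu - 1) / $num_gpu * $num_gpu))
    using the same floor-division semantics."""
    oversampled = train_prompt_bsz * 4 // 3                   # ~1.33x prompts as filtering headroom
    return (oversampled + num_gpu - 1) // num_gpu * num_gpu   # round up to a multiple of num_gpu

print(gen_batch_size(128, 16))   # 128*4//3 = 170, rounded up to 176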
#!/bin/bash
set -x
set -euxo pipefail
project_name='DAPO'
exp_name='DAPO-Early-Qwen2.5-32B'
adv_estimator=grpo
kl_coef=0.0
kl_loss_coef=0.0
clip_ratio_low=0.2
clip_ratio_high=0.28
enable_overlong_buffer=True
overlong_buffer_len=$((1024 * 1))
overlong_penalty_factor=1.0
# An early version for DAPO
enable_filter_groups=True
train_prompt_bsz=128
train_prompt_mini_bsz=64
n_resp_per_prompt=16
use_token_level_loss=True
# Ray
RAY_ADDRESS=${RAY_ADDRESS:-"http://localhost:8265"}
WORKING_DIR=${WORKING_DIR:-"${PWD}"}
RUNTIME_ENV=${RUNTIME_ENV:-"${WORKING_DIR}/verl/trainer/runtime_env.yaml"}
NNODES=${NNODES:-16}
# Paths
# Algorithm
## Train
max_prompt_length=$((1024 * 2))
max_response_length=$((1024 * 20))
## Validation
val_top_k=-1 # 0 for HF rollout, -1 for vLLM rollout
# Performance Related Parameter
sp_size=8
use_dynamic_bsz=True
actor_ppo_max_token_len=$((max_prompt_length + max_response_length))
infer_ppo_max_token_len=$((max_prompt_length + max_response_length))
offload=True
gen_tp=4
ppo_max_token_len_per_gpu=32768
num_gpu=$(($USER_GPUS_PER_NODE * $SLURM_JOB_NUM_NODES))
export VLLM_USE_V1=1
echo "$WANDB_DIR"
echo "$SAVE_DIR"
echo "$WANDB_API_KEY"
# Model path (SFT checkpoint)
MODEL_PATH="/nfs_global/S/lvhanqi/LLaMA-Factory/saves/Qwen2.5-Coder-7B-Instruct-codev-r1-87k/full/sft_6epoch"
# Launch training (node and GPU counts come from USER_GPUS_PER_NODE and SLURM_JOB_NUM_NODES).
python3 -m verl.trainer.main_ppo \
algorithm.adv_estimator=grpo \
data.train_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/qwen7b32b_filter_gt_r1_error_rate_e_0.5_7.4k/train.parquet \
data.val_files=/nfs_global/S/zhuyaoyu/projects/verl/data/codev/v1/qwen7b32b_filter_gt_r1_error_rate_e_0.5_7.4k/test.parquet \
data.train_batch_size=${train_prompt_bsz} \
data.val_batch_size=512 \
data.max_prompt_length=2048 \
data.max_response_length=16384 \
algorithm.filter_groups.enable=${enable_filter_groups} \
algorithm.filter_groups.max_num_gen_batches=999 \
algorithm.filter_groups.metric=acc \
data.gen_batch_size=$((($train_prompt_bsz * 4 / 3 + $num_gpu - 1) / $num_gpu * $num_gpu)) \
actor_rollout_ref.model.path=$MODEL_PATH \
+actor_rollout_ref.model.override_config.attention_dropout=0. \
+actor_rollout_ref.model.override_config.embd_pdrop=0. \
+actor_rollout_ref.model.override_config.resid_pdrop=0. \
+actor_rollout_ref.model.use_liger=True \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.optim.lr=1e-6 \
actor_rollout_ref.actor.optim.weight_decay=0.0 \
actor_rollout_ref.actor.use_dynamic_bsz=True \
actor_rollout_ref.actor.ppo_max_token_len_per_gpu=${ppo_max_token_len_per_gpu} \
actor_rollout_ref.model.use_remove_padding=True \
actor_rollout_ref.actor.clip_ratio_low=${clip_ratio_low} \
actor_rollout_ref.actor.clip_ratio_high=${clip_ratio_high} \
actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
actor_rollout_ref.actor.use_kl_loss=True \
actor_rollout_ref.actor.kl_loss_coef=0.00 \
actor_rollout_ref.actor.kl_loss_type=low_var_kl \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.grad_clip=0.5 \
actor_rollout_ref.actor.use_token_level_loss=${use_token_level_loss} \
actor_rollout_ref.model.enable_gradient_checkpointing=True \
actor_rollout_ref.actor.fsdp_config.param_offload=False \
actor_rollout_ref.actor.fsdp_config.optimizer_offload=False \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=$(($ppo_max_token_len_per_gpu*2)) \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=vllm \
actor_rollout_ref.rollout.n=${n_resp_per_prompt} \
actor_rollout_ref.rollout.val_kwargs.n=4 \
actor_rollout_ref.rollout.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
actor_rollout_ref.rollout.val_kwargs.do_sample=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=False \
reward_model.reward_manager=prime \
actor_rollout_ref.ref.fsdp_config.param_offload=True \
custom_reward_function.overlong_buffer.enable=${enable_overlong_buffer} \
custom_reward_function.overlong_buffer.len=${overlong_buffer_len} \
custom_reward_function.overlong_buffer.penalty_factor=${overlong_penalty_factor} \
custom_reward_function.train.path=verl/utils/reward_score/codev.py \
custom_reward_function.train.name=compute_score_wrapper \
custom_reward_function.train.continuous_reward.enable=False \
algorithm.kl_ctrl.kl_coef=0.0 \
trainer.critic_warmup=0 \
trainer.logger=['console','wandb'] \
trainer.project_name='codev' \
trainer.experiment_name='codev-7b-7.4kdata-r1-gt' \
trainer.n_gpus_per_node=$USER_GPUS_PER_NODE \
trainer.nnodes=$SLURM_JOB_NUM_NODES \
+trainer.val_before_train=False \
trainer.default_local_dir=$SAVE_DIR \
trainer.resume_mode=auto \
trainer.default_hdfs_dir=null \
trainer.save_freq=20 \
trainer.test_freq=20 \
trainer.total_epochs=100 "${@:1}"
# custom_reward_function.path=/nfs_global/S/zhuyaoyu/projects/dapo/verl/utils/reward_score/codev.py \
\ No newline at end of file
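Since the script enables `algorithm.filter_groups` with `metric=acc` and a large `max_num_gen_batches`, generation follows DAPO-style dynamic sampling. A rough, hypothetical sketch of that loop (not verl's actual implementation; `sample_prompts` and `rollout_accuracies` are placeholder callables):

def collect_train_batch(sample_prompts, rollout_accuracies,
                        train_prompt_bsz=128, n_resp_per_prompt=16,
                        max_num_gen_batches=999):
    """DAPO-style group filtering: keep only prompts whose rollouts are
    neither all correct nor all wrong, and keep generating until enough
    such prompt groups have been collected (or the cap is reached)."""
    kept = []
    for _ in range(max_num_gen_batches):
        for prompt in sample_prompts():                           # yields gen_batch_size prompts
            accs = rollout_accuracies(prompt, n_resp_per_prompt)  # list of 0/1 accuracy scores
            if 0 < sum(accs) < n_resp_per_prompt:                 # non-degenerate group -> useful gradient
                kept.append(prompt)
        if len(kept) >= train_prompt_bsz:
            return kept[:train_prompt_bsz]
    return kept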
@@ -8,4 +8,7 @@
 # python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/sft_model_filter_error_rate_l_0.2_from_87k_and_decontamination_qwen_32b_correct_1234.jsonl --local_dir data/codev/v1/err_l0.2_28k_r1_qwen_filtered --train_size 28161 --test_size 500
 # python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/error_rate_l_0.2_from_87k.jsonl --local_dir data/codev/v1/err_l0.2_20k_r1 --train_size 20067 --test_size 500
 # python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/sft_model_filter_error_rate_l_0.2_from_87k.jsonl --local_dir data/codev/v1/err_l0.2_16k_r1_filtered --train_size 16364 --test_size 300
-python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/sft_model_filter_error_rate_l_0.2_from_87k.jsonl --local_dir data/codev/v1/err_l0.2_16k_r1_filtered_double_gt --double_gt --train_size 16364 --test_size 300
+# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/sft_model_filter_error_rate_l_0.2_from_87k.jsonl --local_dir data/codev/v1/err_l0.2_16k_r1_filtered_double_gt --gt double --train_size 16364 --test_size 300
+# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/sft_model_qwen7b32b_filter_gt_r1_error_rate_e_0.5_from_87k.jsonl --local_dir data/codev/v1/qwen7b32b_filter_gt_r1_error_rate_e_0.5_7.4k --gt r1 --train_size 7204 --test_size 200
+python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/sft_model_87k_correct1234_filter_qwen7b32b_data.jsonl --local_dir data/codev/v1/qwen7b32b_filter_gt_r1_14k --gt r1 --train_size 14654 --test_size 300
+# python examples/data_preprocess/codev.py --data_path /nfs_global/S/lvhanqi/codev_data/sft_model_87k_correct12345.jsonl --local_dir data/codev/v1/qwen7b32b_filter_gt_r1_14k --gt r1 --train_size 14654 --test_size 3
\ No newline at end of file
@@ -116,9 +116,9 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Submit a Slurm job with specified parameters.")
     # Add command-line arguments
parser.add_argument("--node_count", type=int, default=1, help="Number of nodes required.") parser.add_argument("--node_count", type=int, default=2, help="Number of nodes required.")
parser.add_argument("--gpus_per_node", type=int, default=8, help="Number of GPUs per node (4 or 8).") parser.add_argument("--gpus_per_node", type=int, default=8, help="Number of GPUs per node (4 or 8).")
parser.add_argument("--node_type", type=str, default="r8l40s", help="Node type (r8l40/r8l40s/r8a100).") parser.add_argument("--node_type", type=str, default="r8l40", help="Node type (r8l40/r8l40s/r8a100).")
parser.add_argument("--partition", type=str, default=None, help="Partition name. (r8nv-gpu-dedicated needs to be specified)") parser.add_argument("--partition", type=str, default=None, help="Partition name. (r8nv-gpu-dedicated needs to be specified)")
parser.add_argument("--qos", type=str, default=None, help="QOS type. (gpu-long needs to be specified)") parser.add_argument("--qos", type=str, default=None, help="QOS type. (gpu-long needs to be specified)")
......
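With the new submission defaults (2 nodes, 8 GPUs each), and assuming USER_GPUS_PER_NODE and SLURM_JOB_NUM_NODES are derived from these arguments, the run scripts above see 16 GPUs:

# Hypothetical check of what the new defaults imply for num_gpu in the run scripts.
node_count, gpus_per_node = 2, 8
num_gpu = node_count * gpus_per_node
print(num_gpu)   # 16, the value used as an example in the gen_batch_size sketch above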