Commit a0930e4b by Yaoyu Zhu

update templates for rented servers

parent ed46c1a3
### Launch scripts
- Launch scripts for our own Slurm cluster are under `template/slurm`; launch scripts for the rented, manually configured servers are under `template/manualMultiNodes`.
- The configs for the actual RL experiments are in `recipe/dapo`.
### Data preprocessing
The preprocessing script is `examples/data_preprocess/codev.py`; `scripts/preprocess.sh` records how it is invoked, including the paths of the raw data and the processed data.
### Run scripts
In `recipe/dapo`.
### Code changes
- The reward function is in `verl/utils/reward_score/codev.py`. Because extra parameters such as `continuous_reward` were added, the run scripts invoke it externally via `custom_reward_function`, and the following files were modified:
- `main_ppo.py` under the `verl` folder was modified: a `functools.partial` wrapper passes the extra parameters through to `compute_score` in `codev.py` (a minimal sketch follows after this list).
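The pattern can be sketched as below. This is a hypothetical sketch: the real `compute_score` signature and reward logic live in `verl/utils/reward_score/codev.py`, and only the `functools.partial` binding is taken from the change described above.

```python
import functools

def compute_score(solution_str: str, ground_truth: str, continuous_reward: bool = False) -> float:
    """Hypothetical stand-in for compute_score in verl/utils/reward_score/codev.py."""
    exact = float(solution_str.strip() == ground_truth.strip())
    if not continuous_reward or exact:
        return exact
    # With continuous_reward enabled, hand out partial credit instead of a hard 0/1 score.
    overlap = len(set(solution_str.split()) & set(ground_truth.split()))
    return overlap / max(len(ground_truth.split()), 1)

# In main_ppo.py the extra arguments are bound once, so the trainer can keep calling
# the reward function with just (solution_str, ground_truth).
reward_fn = functools.partial(compute_score, continuous_reward=True)
print(reward_fn("module top; endmodule", "module top; wire a; endmodule"))
```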
......
#!/bin/bash
set -x
SLURM_JOB_NUM_NODES=2
USER_GPUS_PER_NODE=8
MASTER_ADDR=10.10.40.221
MASTER_PORT=56379
# DASHBOARD_PORT=8265
# Extract device names and merge them into a comma-separated string
THIS_UP_IB_DEV=$(ibdev2netdev | grep Up | grep ib | awk '{print $1}' | paste -sd ',' -)
export NCCL_IB_HCA=$THIS_UP_IB_DEV
#- Log information
#- Important setting!!!
## otherwise it will cause an error of insufficient RDMA resources:
ulimit -l unlimited
## otherwise it will result in an insufficient virtual memory size error, especially when loading LLM:
ulimit -v unlimited
ulimit -n 65535
ulimit -u 4125556
# module load cuda-cudnn/11.8-8.8.1
# export CUDA_HOME=/tools/cluster-software/cuda-cudnn/cuda-11.8.0-8.8.1
# which nvcc
# echo $CUDA_HOME
#- WARNING! DO NOT MODIFY your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Node $SLURM_NODEID, LocalID $SLURM_LOCALID: Use GPU ${CUDA_VISIBLE_DEVICES}"
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
##- Monitor
# The script continues executing other tasks while the following command will execute after a while
echo "Main program continues to run. Monitoring information will be exported after three hours."
#- Main program execution
##- virtualenv
# source xxxxx/activate
source ~/.bashrc
conda activate verl
echo "Python path after activation:"
python -c "import sys; print(sys.path)"
##- Job step TODO
# Ray's default GCS (Global Control Store) port is 6379
# and the default dashboard port is 8265
# `"working_dir": "."` must be set in --runtime-env-json, otherwise working_dir defaults to ~ (/home/S/your_name)
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=120
export RAY_record_ref_creation_sites=1
export RAY_IGNORE_UNHANDLED_ERRORS=0
export HYDRA_FULL_ERROR=1
export PYTHONUNBUFFERED=TRUE
export VLLM_USE_V1=1
# Note: these NCCL variables need to be set, and GLOO likewise needs a SOCKET_IFNAME (https://haihub.cloud.tencent.com/detail?imageId=tencent/taco-train/a800)
# export NCCL_SOCKET_IFNAME=eth0
# export NCCL_IB_GID_INDEX=3
# export NCCL_IB_DISABLE=0
# export NCCL_NET_GDR_LEVEL=2
# export NCCL_IB_QPS_PER_CONNECTION=4
# export NCCL_IB_TC=160
# export NCCL_IB_TIMEOUT=22
# export GLOO_SOCKET_IFNAME=eth0
# export TP_SOCKET_IFNAME=eth0
# echo "GLOO_SOCKET_IFNAME is $GLOO_SOCKET_IFNAME!"
export RAY_TEMP_DIR="/tmp/ray_$SLURM_JOBID"
echo "RAY TEMP DIR is $RAY_TEMP_DIR"
export CURR_DIR=$(realpath .)
USER=$(whoami)
# export TORCH_EXTENSIONS_DIR="/workspace/S/$USER/.cache/torch_extensions"
# export HF_HOME="/workspace/S/$USER/.cache"
# export PIP_TOOLS_CACHE_DIR="/workspace/S/$USER/.cache/pip-tools"
# export TRITON_CACHE_DIR="/workspace/S/$USER/.triton/autotune"
export WANDB_API_KEY='0a72cf472255879d3bad4939d3b39506e4a8573b'
wandb login $WANDB_API_KEY
export WANDB_MODE=offline
echo "USER GPUS PER NODE IS $USER_GPUS_PER_NODE"
ray stop --force
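# Derive the remaining Ray service ports from MASTER_PORT so each service gets a
# distinct, predictable port on these shared machines.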
DASHBOARD_PORT=$(($MASTER_PORT-10000))
DAL_PORT=$(($MASTER_PORT-20000))
RCS_PORT=$(($MASTER_PORT-30000))
RS_PORT=$(($MASTER_PORT-5000))
NM_PORT=$(($MASTER_PORT-15000))
OM_PORT=$(($MASTER_PORT-25000))
NODE_IP=$(ifconfig | grep -A 1 "bond0.45" | grep "inet" | awk '{print $2}')
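# The node whose bond0.45 address matches MASTER_ADDR becomes the Ray head; all
# other nodes join the cluster as workers below.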
if [ "$NODE_IP" == "$MASTER_ADDR" ]; then
# launch the master node of ray in container
ray start --head --node-ip-address $MASTER_ADDR --port $MASTER_PORT --redis-shard-ports $RS_PORT --node-manager-port $NM_PORT --object-manager-port $OM_PORT --dashboard-port $DASHBOARD_PORT --dashboard-agent-listen-port $DAL_PORT --ray-client-server-port $RCS_PORT --num-gpus $USER_GPUS_PER_NODE --temp-dir=$RAY_TEMP_DIR # https://github.com/OpenRLHF/OpenRLHF/issues/339
fi
# sleep 99999
export RAY_START_TIMEOUT=180
# wait for master node
timeout $RAY_START_TIMEOUT bash -c "while ! nc -z $MASTER_ADDR ${MASTER_PORT}; do sleep 2; done"
if [ $? -ne 0 ]; then
echo "Ray start on master node timed out!!!"
ray stop --force
exit 1
fi
if [ "$NODE_IP" != "$MASTER_ADDR" ]; then
# if you want to launch ray on more nodes, use
ray start --address $MASTER_ADDR:$MASTER_PORT --num-gpus $USER_GPUS_PER_NODE --temp-dir=$RAY_TEMP_DIR
fi
# wait for other nodes
# SLURM_JOB_NUM_NODES is not exported, so let the outer shell expand it here
timeout $RAY_START_TIMEOUT bash -c "while [ \$(ray status | grep -c 'node_') -lt $SLURM_JOB_NUM_NODES ]; do sleep 2; done"
if [ $? -ne 0 ]; then
echo "Timeout waiting for worker nodes!"
ray stop --force
exit 1
fi
echo "All worker nodes are ready!"
ray status
# exit 0  # leftover early exit; leaving it enabled would skip the job submission below
# only need to submit job on the master node,
# and submitting on other nodes will cause network errors
if [ "$NODE_IP" == "$MASTER_ADDR" ]; then
ray list nodes
SCRIPT_TO_RUN="$CURR_DIR/recipe/dapo/blockelite/run_dapo_codev_7b_3.1k.sh"
export SAVE_DIR="$CURR_DIR/results/codev_7b_dapo_3.1kdata"
# SCRIPT_TO_RUN="$CURR_DIR/recipe/dapo/blockelite/dapo_7b_test.sh"
# export SAVE_DIR="$CURR_DIR/results/dapo_7b_test"
mkdir -p $SAVE_DIR
cp $SCRIPT_TO_RUN $SAVE_DIR
# copy_log_and_plot() {
# sleep 3m
# while true; do
# python $CURR_DIR/plot_and_analyze/plot.py --folder $SAVE_DIR
# sleep 3m # run every 3 minutes; adjust the interval as needed
# done
# }
# copy_log_and_plot &
# COPY_PID=$!
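# Build the Ray runtime environment as JSON with jq: "working_dir": "." ships the
# repo to the workers, "excludes" keeps large folders out of that upload, and
# "env_vars" are forwarded to every remote task.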
RUNTIME_ENV=$(jq -n --arg save_dir "$SAVE_DIR" --arg gpu_per_node "$USER_GPUS_PER_NODE" --arg nnode "$SLURM_JOB_NUM_NODES" '{
"pip": ["ray"],
"working_dir": ".",
"excludes": ["ckpt/", "xxx/", "ret_one/", "data/", "results/", ".git/"],
"disable_caching": true,
"env_vars": {"SAVE_DIR": $save_dir,
"WANDB_DIR": $save_dir,
# "NCCL_SOCKET_IFNAME": "ibp70s0",
# "NCCL_IB_GID_INDEX": "3",
# "NCCL_IB_DISABLE": "0",
# "NCCL_NET_GDR_LEVEL": "2",
# "NCCL_IB_QPS_PER_CONNECTION": "4",
# "NCCL_IB_TC": "160",
# "NCCL_IB_TIMEOUT": "22",
"GLOO_SOCKET_IFNAME": "bond0.45",
"TP_SOCKET_IFNAME": "bond0.45",
"USER_GPUS_PER_NODE":$gpu_per_node, "SLURM_JOB_NUM_NODES":$nnode,
}
}')
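# Submit the training job through the local dashboard; stdout and stderr are
# captured under $SAVE_DIR.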
ray job submit --address="http://127.0.0.1:$DASHBOARD_PORT" --runtime-env-json="$RUNTIME_ENV" -- bash $SCRIPT_TO_RUN 1>"$SAVE_DIR/run.log" 2>"$SAVE_DIR/run.err"
# kill $COPY_PID  # copy_log_and_plot above is commented out, so COPY_PID is never set
python $CURR_DIR/plot_and_analyze/plot.py --folder $SAVE_DIR
# sleep 48h
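# Preserve the latest Ray session logs for post-mortem inspection before the
# cluster is torn down.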
mkdir -p tmp/ray_$USER
chmod 777 tmp
cp -rfL $RAY_TEMP_DIR/session_latest tmp/ray_$USER/
ray stop --force
else
# echo "Worker node $SLURM_PROCID is waiting for head node to finish"
# Function to check connection to master node
check_connection() {
timeout 60 bash -c "while ! nc -z $MASTER_ADDR ${MASTER_PORT}; do sleep 5; done"
return $?
}
while true; do
if ! check_connection; then
echo "Connection to master node lost. Exiting worker node."
break
fi
sleep 60 # Check every 60 seconds
done
ray stop --force
fi
#- End
slurm-gpu-atop-log-stats $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
# This will overwrite any existing atop logs from previous runs.
# WARNING: If your program times out or is terminated by scancel,
# the above script part might not execute correctly.