Commit a0930e4b by Yaoyu Zhu

update templates for rented servers

parent ed46c1a3
### Launch scripts
- Launch scripts for our own Slurm cluster are under `template/slurm`; launch scripts for the rented, manually configured servers are under `template/manualMultiNodes`.
- The configs for the actual RL experiments are in `recipe/dapo`.
### Data preprocessing
The preprocessing script is `examples/data_preprocess/codev.py`; `scripts/preprocess.sh` records how it is invoked, including the paths of the raw data and the processed data.
### Run scripts
In `recipe/dapo`.
### Code changes
- The reward function is in `verl/utils/reward_score/codev.py`. Because extra parameters such as `continuous_reward` were added, the run scripts invoke it externally via `custom_reward_function`, and the following files were modified:
- `main_ppo.py` under the `verl` folder was modified: a `functools.partial` wrapper passes the extra parameters through to `compute_score` in `codev.py` (a minimal sketch follows after this list).
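The pattern can be sketched as below. This is a hypothetical sketch: the real `compute_score` signature and reward logic live in `verl/utils/reward_score/codev.py`, and only the `functools.partial` binding is taken from the change described above.

```python
import functools

def compute_score(solution_str: str, ground_truth: str, continuous_reward: bool = False) -> float:
    """Hypothetical stand-in for compute_score in verl/utils/reward_score/codev.py."""
    exact = float(solution_str.strip() == ground_truth.strip())
    if not continuous_reward or exact:
        return exact
    # With continuous_reward enabled, hand out partial credit instead of a hard 0/1 score.
    overlap = len(set(solution_str.split()) & set(ground_truth.split()))
    return overlap / max(len(ground_truth.split()), 1)

# In main_ppo.py the extra arguments are bound once, so the trainer can keep calling
# the reward function with just (solution_str, ground_truth).
reward_fn = functools.partial(compute_score, continuous_reward=True)
print(reward_fn("module top; endmodule", "module top; wire a; endmodule"))
```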
......
#!/bin/bash
set -x
SLURM_JOB_NUM_NODES=2
USER_GPUS_PER_NODE=8
MASTER_ADDR=10.10.40.221
MASTER_PORT=56379
# DASHBOARD_PORT=8265
# Extract device names and merge them into a comma-separated string
THIS_UP_IB_DEV=$(ibdev2netdev | grep Up | grep ib | awk '{print $1}' | paste -sd ',' -)
export NCCL_IB_HCA=$THIS_UP_IB_DEV
#- Log information
#- Important setting!!!
## otherwise it will cause an error of insufficient RDMA resources:
ulimit -l unlimited
## otherwise it will result in an insufficient virtual memory size error, especially when loading LLM:
ulimit -v unlimited
ulimit -n 65535
ulimit -u 4125556
# module load cuda-cudnn/11.8-8.8.1
# export CUDA_HOME=/tools/cluster-software/cuda-cudnn/cuda-11.8.0-8.8.1
# which nvcc
# echo $CUDA_HOME
#- WARNING! DO NOT MODIFY your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Node $SLURM_NODEID, LocalID $SLURM_LOCALID: Use GPU ${CUDA_VISIBLE_DEVICES}"
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
##- Monitor
# The script continues executing other tasks while the following command will execute after a while
echo "Main program continues to run. Monitoring information will be exported after three hours."
#- Main program execution
##- virtualenv
# source xxxxx/activate
source ~/.bashrc
conda activate verl
echo "Python path after activation:"
python -c "import sys; print(sys.path)"
##- Job step TODO
# Ray's default GCS (Global Control Store) port is 6379
# and the default dashboard port is 8265
# `"working_dir": "."` must be set in --runtime-env-json, otherwise working_dir defaults to ~ (/home/S/your_name)
export TORCH_NCCL_ASYNC_ERROR_HANDLING=1
export NCCL_DEBUG=INFO
export NCCL_TIMEOUT=120
export RAY_record_ref_creation_sites=1
export RAY_IGNORE_UNHANDLED_ERRORS=0
export HYDRA_FULL_ERROR=1
export PYTHONUNBUFFERED=TRUE
export VLLM_USE_V1=1
# Note: these NCCL variables need to be set, and GLOO likewise needs a SOCKET_IFNAME (https://haihub.cloud.tencent.com/detail?imageId=tencent/taco-train/a800)
# export NCCL_SOCKET_IFNAME=eth0
# export NCCL_IB_GID_INDEX=3
# export NCCL_IB_DISABLE=0
# export NCCL_NET_GDR_LEVEL=2
# export NCCL_IB_QPS_PER_CONNECTION=4
# export NCCL_IB_TC=160
# export NCCL_IB_TIMEOUT=22
# export GLOO_SOCKET_IFNAME=eth0
# export TP_SOCKET_IFNAME=eth0
# echo "GLOO_SOCKET_IFNAME is $GLOO_SOCKET_IFNAME!"
export RAY_TEMP_DIR="/tmp/ray_$SLURM_JOBID"
echo "RAY TEMP DIR is $RAY_TEMP_DIR"
export CURR_DIR=$(realpath .)
USER=$(whoami)
# export TORCH_EXTENSIONS_DIR="/workspace/S/$USER/.cache/torch_extensions"
# export HF_HOME="/workspace/S/$USER/.cache"
# export PIP_TOOLS_CACHE_DIR="/workspace/S/$USER/.cache/pip-tools"
# export TRITON_CACHE_DIR="/workspace/S/$USER/.triton/autotune"
export WANDB_API_KEY='0a72cf472255879d3bad4939d3b39506e4a8573b'
wandb login $WANDB_API_KEY
export WANDB_MODE=offline
echo "USER GPUS PER NODE IS $USER_GPUS_PER_NODE"
ray stop --force
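# Derive the remaining Ray service ports from MASTER_PORT so each service gets a
# distinct, predictable port on these shared machines.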
DASHBOARD_PORT=$(($MASTER_PORT-10000))
DAL_PORT=$(($MASTER_PORT-20000))
RCS_PORT=$(($MASTER_PORT-30000))
RS_PORT=$(($MASTER_PORT-5000))
NM_PORT=$(($MASTER_PORT-15000))
OM_PORT=$(($MASTER_PORT-25000))
NODE_IP=$(ifconfig | grep -A 1 "bond0.45" | grep "inet" | awk '{print $2}')
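# The node whose bond0.45 address matches MASTER_ADDR becomes the Ray head; all
# other nodes join the cluster as workers below.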
if [ "$NODE_IP" == "$MASTER_ADDR" ]; then
# launch the master node of ray in container
ray start --head --node-ip-address $MASTER_ADDR --port $MASTER_PORT --redis-shard-ports $RS_PORT --node-manager-port $NM_PORT --object-manager-port $OM_PORT --dashboard-port $DASHBOARD_PORT --dashboard-agent-listen-port $DAL_PORT --ray-client-server-port $RCS_PORT --num-gpus $USER_GPUS_PER_NODE --temp-dir=$RAY_TEMP_DIR # https://github.com/OpenRLHF/OpenRLHF/issues/339
fi
# sleep 99999
export RAY_START_TIMEOUT=180
# wait for master node
timeout $RAY_START_TIMEOUT bash -c "while ! nc -z $MASTER_ADDR ${MASTER_PORT}; do sleep 2; done"
if [ $? -ne 0 ]; then
echo "Ray start on master node timed out!!!"
ray stop --force
exit 1
fi
if [ "$NODE_IP" != "$MASTER_ADDR" ]; then
# if you want to launch ray on more nodes, use
ray start --address $MASTER_ADDR:$MASTER_PORT --num-gpus $USER_GPUS_PER_NODE --temp-dir=$RAY_TEMP_DIR
fi
# wait for other nodes
# SLURM_JOB_NUM_NODES is not exported, so let the outer shell expand it here
timeout $RAY_START_TIMEOUT bash -c "while [ \$(ray status | grep -c 'node_') -lt $SLURM_JOB_NUM_NODES ]; do sleep 2; done"
if [ $? -ne 0 ]; then
echo "Timeout waiting for worker nodes!"
ray stop --force
exit 1
fi
echo "All worker nodes are ready!"
ray status
# exit 0  # leftover early exit; leaving it enabled would skip the job submission below
# only need to submit job on the master node,
# and submitting on other nodes will cause network errors
if [ "$NODE_IP" == "$MASTER_ADDR" ]; then
ray list nodes
SCRIPT_TO_RUN="$CURR_DIR/recipe/dapo/blockelite/run_dapo_codev_7b_3.1k.sh"
export SAVE_DIR="$CURR_DIR/results/codev_7b_dapo_3.1kdata"
# SCRIPT_TO_RUN="$CURR_DIR/recipe/dapo/blockelite/dapo_7b_test.sh"
# export SAVE_DIR="$CURR_DIR/results/dapo_7b_test"
mkdir -p $SAVE_DIR
cp $SCRIPT_TO_RUN $SAVE_DIR
# copy_log_and_plot() {
# sleep 3m
# while true; do
# python $CURR_DIR/plot_and_analyze/plot.py --folder $SAVE_DIR
# sleep 3m # run every 3 minutes; adjust the interval as needed
# done
# }
# copy_log_and_plot &
# COPY_PID=$!
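# Build the Ray runtime environment as JSON with jq: "working_dir": "." ships the
# repo to the workers, "excludes" keeps large folders out of that upload, and
# "env_vars" are forwarded to every remote task.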
RUNTIME_ENV=$(jq -n --arg save_dir "$SAVE_DIR" --arg gpu_per_node "$USER_GPUS_PER_NODE" --arg nnode "$SLURM_JOB_NUM_NODES" '{
"pip": ["ray"],
"working_dir": ".",
"excludes": ["ckpt/", "xxx/", "ret_one/", "data/", "results/", ".git/"],
"disable_caching": true,
"env_vars": {"SAVE_DIR": $save_dir,
"WANDB_DIR": $save_dir,
# "NCCL_SOCKET_IFNAME": "ibp70s0",
# "NCCL_IB_GID_INDEX": "3",
# "NCCL_IB_DISABLE": "0",
# "NCCL_NET_GDR_LEVEL": "2",
# "NCCL_IB_QPS_PER_CONNECTION": "4",
# "NCCL_IB_TC": "160",
# "NCCL_IB_TIMEOUT": "22",
"GLOO_SOCKET_IFNAME": "bond0.45",
"TP_SOCKET_IFNAME": "bond0.45",
"USER_GPUS_PER_NODE":$gpu_per_node, "SLURM_JOB_NUM_NODES":$nnode,
}
}')
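# Submit the training job through the local dashboard; stdout and stderr are
# captured under $SAVE_DIR.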
ray job submit --address="http://127.0.0.1:$DASHBOARD_PORT" --runtime-env-json="$RUNTIME_ENV" -- bash $SCRIPT_TO_RUN 1>"$SAVE_DIR/run.log" 2>"$SAVE_DIR/run.err"
# kill $COPY_PID  # copy_log_and_plot above is commented out, so COPY_PID is never set
python $CURR_DIR/plot_and_analyze/plot.py --folder $SAVE_DIR
# sleep 48h
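# Preserve the latest Ray session logs for post-mortem inspection before the
# cluster is torn down.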
mkdir -p tmp/ray_$USER
chmod 777 tmp
cp -rfL $RAY_TEMP_DIR/session_latest tmp/ray_$USER/
ray stop --force
else
# echo "Worker node $SLURM_PROCID is waiting for head node to finish"
# Function to check connection to master node
check_connection() {
timeout 60 bash -c "while ! nc -z $MASTER_ADDR ${MASTER_PORT}; do sleep 5; done"
return $?
}
while true; do
if ! check_connection; then
echo "Connection to master node lost. Exiting worker node."
break
fi
sleep 60 # Check every 60 seconds
done
ray stop --force
fi
#- End
slurm-gpu-atop-log-stats $SLURM_JOB_ID $CUDA_VISIBLE_DEVICES
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
# This will overwrite any existing atop logs from previous runs.
# WARNING: If your program times out or is terminated by scancel,
# the above script part might not execute correctly.