Commit c99c9d43 by ZhangXiaoyun

slurm fix

parent 69ea6d70
*.out
*.log
*.err
*.json
*.jsonl
.tmp/
\ No newline at end of file
......@@ -67,6 +67,14 @@ echo "Use GPU ${CUDA_VISIBLE_DEVICES}" # which gpus
# export CUDA_DEVICE_ORDER="PCI_BUS_ID"
#- Important setting!!!
## Remove the locked-memory limit; otherwise the job fails with an "insufficient RDMA resources" error:
ulimit -l unlimited
## Remove the virtual-memory limit; otherwise the job fails with an "insufficient virtual memory size" error, especially when loading an LLM:
ulimit -v unlimited
ulimit -n 65535
ulimit -u 4125556
#- Job step
# sleep 30h
cd /nfs_global/S/zhangxiaoyun/prm/openr
......
......@@ -6,3 +6,44 @@ Currently Loaded Modulefiles:
1) cluster-tools/v1.0 4) slurm-tools/v1.0 7) cuda-cudnn/11.8-8.8.1
2) git/2.31.1 5) cmake/3.21.7
3) python3/3.8.16 6) mpich/3.2.1
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/fuzzywuzzy/fuzz.py:11: UserWarning: Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning
warnings.warn('Using slow pure-python SequenceMatcher. Install python-Levenshtein to remove this warning')
2025-02-27 14:48:50,096 INFO worker.py:1816 -- Started a local Ray instance.
[2025-02-27 14:48:52,314 C 974958 974958] shared_memory.cc:32: mmap failed
*** StackTrace Information ***
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(+0x114367a) [0x7ff05c96867a] ray::operator<<()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray6RayLogD1Ev+0x4d1) [0x7ff05c96ad01] ray::RayLog::~RayLog()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(+0x9b9828) [0x7ff05c1de828] plasma::ClientMmapTableEntry::ClientMmapTableEntry()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(+0x9b4810) [0x7ff05c1d9810] plasma::PlasmaClient::Impl::GetStoreFdAndMmap()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(_ZN6plasma12PlasmaClient4Impl17HandleCreateReplyERKN3ray8ObjectIDEbPKhPmPSt10shared_ptrINS2_6BufferEE+0x3bf) [0x7ff05c1da6af] plasma::PlasmaClient::Impl::HandleCreateReply()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(_ZN6plasma12PlasmaClient4Impl22CreateAndSpillIfNeededERKN3ray8ObjectIDERKNS2_3rpc7AddressEblPKhlPSt10shared_ptrINS2_6BufferEENS_7flatbuf12ObjectSourceEi+0x22b) [0x7ff05c1db12b] plasma::PlasmaClient::Impl::CreateAndSpillIfNeeded()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(_ZN6plasma12PlasmaClient22CreateAndSpillIfNeededERKN3ray8ObjectIDERKNS1_3rpc7AddressEblPKhlPSt10shared_ptrINS1_6BufferEENS_7flatbuf12ObjectSourceEi+0x2b) [0x7ff05c1db75b] plasma::PlasmaClient::CreateAndSpillIfNeeded()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray4core29CoreWorkerPlasmaStoreProvider6CreateERKSt10shared_ptrINS_6BufferEEmRKNS_8ObjectIDERKNS_3rpc7AddressEPS4_bb+0xe6) [0x7ff05c16ef36] ray::core::CoreWorkerPlasmaStoreProvider::Create()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray4core29CoreWorkerPlasmaStoreProvider11WarmupStoreEv+0x9b) [0x7ff05c1714cb] ray::core::CoreWorkerPlasmaStoreProvider::WarmupStore()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray4core29CoreWorkerPlasmaStoreProviderC2ERKSsSt10shared_ptrINS_6raylet12RayletClientEES4_INS0_16ReferenceCounterEESt8functionIFNS_6StatusEvEEbSA_IFSsvEE+0x36b) [0x7ff05c171bab] ray::core::CoreWorkerPlasmaStoreProvider::CoreWorkerPlasmaStoreProvider()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray4core10CoreWorkerC1ERKNS0_17CoreWorkerOptionsERKNS_8WorkerIDE+0x205b) [0x7ff05c0fc5eb] ray::core::CoreWorker::CoreWorker()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray4core21CoreWorkerProcessImplC2ERKNS0_17CoreWorkerOptionsE+0x566) [0x7ff05c10ed16] ray::core::CoreWorkerProcessImpl::CoreWorkerProcessImpl()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(_ZN3ray4core17CoreWorkerProcess10InitializeERKNS0_17CoreWorkerOptionsE+0x98) [0x7ff05c10fe68] ray::core::CoreWorkerProcess::Initialize()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(+0x74433f) [0x7ff05bf6933f] __pyx_pw_3ray_7_raylet_10CoreWorker_1__cinit__()
/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/lib/python3.10/site-packages/ray/_raylet.so(+0x7455e9) [0x7ff05bf6a5e9] __pyx_tp_new_3ray_7_raylet_CoreWorker()
python(_PyObject_MakeTpCall+0x193) [0x4f64b3] _PyObject_MakeTpCall
python(_PyEval_EvalFrameDefault+0x53ee) [0x4f275e] _PyEval_EvalFrameDefault
python(_PyFunction_Vectorcall+0x6f) [0x4fcf3f] _PyFunction_Vectorcall
python(_PyEval_EvalFrameDefault+0x13b2) [0x4ee722] _PyEval_EvalFrameDefault
python(_PyFunction_Vectorcall+0x6f) [0x4fcf3f] _PyFunction_Vectorcall
python(PyObject_Call+0xb8) [0x508cd8] PyObject_Call
python(_PyEval_EvalFrameDefault+0x2de4) [0x4f0154] _PyEval_EvalFrameDefault
python(_PyFunction_Vectorcall+0x6f) [0x4fcf3f] _PyFunction_Vectorcall
python(_PyEval_EvalFrameDefault+0x13b2) [0x4ee722] _PyEval_EvalFrameDefault
python() [0x5924f2] _PyEval_Vector
python(PyEval_EvalCode+0x87) [0x592437] PyEval_EvalCode
python() [0x5c3237] run_eval_code_obj
python() [0x5be380] run_mod
python() [0x4598d6] pyrun_file.cold
python(_PyRun_SimpleFileObject+0x19f) [0x5b890f] _PyRun_SimpleFileObject
python(_PyRun_AnyFileObject+0x43) [0x5b8673] _PyRun_AnyFileObject
python(Py_RunMain+0x38d) [0x5b542d] Py_RunMain
python(Py_BytesMain+0x39) [0x585609] Py_BytesMain
/lib64/libc.so.6(__libc_start_main+0xe5) [0x7ff1bb02dd85] __libc_start_main
python() [0x5854be]
Job start at 2025-02-27 13:10:47
Job start at 2025-02-27 14:46:56
Job run at:
Static hostname: localhost.localdomain
Transient hostname: r8a100-d01
......@@ -28,7 +28,7 @@ Disk quotas for user zhangxiaoyun (uid 6191):
############### /nfs_global
Disk quotas for user zhangxiaoyun (uid 6191):
Filesystem space quota limit grace files quota limit grace
/nfs_global 1594G 5120G 7168G 39469 5000k 10000k
/nfs_global 1594G 5120G 7168G 39496 5000k 10000k
############### /lustre
Disk quotas for usr zhangxiaoyun (uid 6191):
......@@ -36,7 +36,7 @@ Disk quotas for usr zhangxiaoyun (uid 6191):
/lustre 0k 8T 10T - 0 3000000 36000000 -
uid 6191 is using default block quota setting
uid 6191 is using default file quota setting
Thu Feb 27 13:10:48 2025
Thu Feb 27 14:46:57 2025
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15 Driver Version: 550.54.15 CUDA Version: 12.4 |
|-----------------------------------------+------------------------+----------------------+
......@@ -45,35 +45,35 @@ Thu Feb 27 13:10:48 2025
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA A100 80GB PCIe On | 00000000:35:00.0 Off | 0 |
| N/A 37C P0 57W / 300W | 0MiB / 81920MiB | 0% Default |
| N/A 36C P0 56W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 1 NVIDIA A100 80GB PCIe On | 00000000:36:00.0 Off | 0 |
| N/A 40C P0 57W / 300W | 0MiB / 81920MiB | 0% Default |
| N/A 37C P0 56W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 2 NVIDIA A100 80GB PCIe On | 00000000:39:00.0 Off | 0 |
| N/A 42C P0 57W / 300W | 0MiB / 81920MiB | 0% Default |
| N/A 37C P0 56W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 3 NVIDIA A100 80GB PCIe On | 00000000:3D:00.0 Off | 0 |
| N/A 37C P0 56W / 300W | 0MiB / 81920MiB | 0% Default |
| N/A 34C P0 55W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 4 NVIDIA A100 80GB PCIe On | 00000000:9C:00.0 Off | 0 |
| N/A 37C P0 56W / 300W | 0MiB / 81920MiB | 0% Default |
| N/A 34C P0 55W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 5 NVIDIA A100 80GB PCIe On | 00000000:9D:00.0 Off | 0 |
| N/A 40C P0 58W / 300W | 0MiB / 81920MiB | 0% Default |
| N/A 37C P0 57W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 6 NVIDIA A100 80GB PCIe On | 00000000:A0:00.0 Off | 0 |
| N/A 38C P0 55W / 300W | 0MiB / 81920MiB | 0% Default |
| N/A 34C P0 54W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
| 7 NVIDIA A100 80GB PCIe On | 00000000:A4:00.0 Off | 0 |
| N/A 39C P0 56W / 300W | 0MiB / 81920MiB | 0% Default |
| N/A 35C P0 55W / 300W | 0MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
......@@ -88,3 +88,4 @@ Use GPU 0,1,2,3,4,5,6,7
PYTHON_EXECUTABLE=/workspace/S/zhangxiaoyun/miniconda3/envs/open_reasoner/bin/python3
Wait 5 seconds ...
Starting workers
Job end at 2025-02-27 14:48:52
......@@ -65,6 +65,7 @@ if __name__ == "__main__":
parser.add_argument("--num_worker", type=int, default=32)
config = parser.parse_args()
ray.init(_temp_dir="/nfs_global/S/zhangxiaoyun/prm/openr/.tmp")
setup_seed(config.seed)
if config.local:
print("run in pure local mode for debug only")
......
set -e
export RAY_TEMP_DIR="/tmp/ray_$SLURM_JOBID"
echo "RAY TEMP DIR is $RAY_TEMP_DIR"
HOST_ADDR=0.0.0.0
CONTROLER_PORT=28777
WORKER_BASE_PORT=30010
......
export RAY_TEMP_DIR="/tmp/ray_$SLURM_JOBID"
echo "RAY TEMP DIR is $RAY_TEMP_DIR"
python reason/evaluation/evaluate.py \
--LM Qwen2.5-Math-1.5B-Instruct \
--RM Qwen2.5-Math-1.5B-Instruct_RM \
......@@ -9,7 +12,7 @@ python reason/evaluation/evaluate.py \
--tree_max_depth 50 \
--save_dir debug \
--method beam_search \
--num_worker 32 \
--num_worker 64 \
--controller_addr http://0.0.0.0:28777
# math-shepherd-mistral-7b-prm
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment