Commit 00cffb98 by Shi wenxuan

Merge branch 'swx' into 'master'

Swx

See merge request !1
parents ca6feac2 2c7b8a6b
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
### beam size 关键路径 (critical code path for beam size)
prm/openr/reason/evaluation/evaluate.py line 201
prm/openr/reason/evaluation/methods.py line 122
prm/openr/reason/guided_search/tree.py line 434
prm/openr/reason/guided_search/tree.py line 449
prm/openr/reason/guided_search/tree.py line 461
\ No newline at end of file
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -218,7 +218,7 @@ class CoTEnv(BaseEnv):
processed_act = self.post_process_act(texts[i])
if (
len(processed_act) > 0
and processed_act not in text_list
# and processed_act not in text_list
# only stop is valid, otherwise the output action is truncated actually
and result.finish_reason[i] == "stop"
):
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
#!/bin/bash
#
# SLURM batch script: start the vLLM gold-PRM LLM service, then run the
# beam-search evaluation for Qwen2.5-Math-7B-Instruct at --acc 1.0.
# Output/error files for resubmitted runs go under ret_one/.
#
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J inference # The job name
#SBATCH -o inference.out # Write the standard output to file named 'ret-<job_number>.out'
#SBATCH -e inference.err # Write the standard error to file named 'ret-<job_number>.err'
#- Resources
# (TODO)
# Please modify your requirements
#SBATCH -p r8nv-gpu-hw # Submit to 'nv-gpu' Partitiion
#SBATCH -t 1-06:00:00 # Run for a maximum time of 0 days, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:8 # Request M GPU per node
#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
#SBATCH --qos=gpu-normal # Request QOS Type
#SBATCH --constraint="L40S"
###
### The system will alloc 8 or 16 cores per gpu by default.
### If you need more or less, use following:
### #SBATCH --cpus-per-task=K # Request K cores
###
###
### Without specifying the constraint, any available nodes that meet the requirement will be allocated
### You can specify the characteristics of the compute nodes, and even the names of the compute nodes
###
### #SBATCH --nodelist=gpu-v00 # Request a specific list of hosts
### #SBATCH --constraint="Volta|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
###
# FIX: the ret_one directory setup used to sit above the #SBATCH block.
# sbatch stops scanning for #SBATCH directives at the first non-comment
# command, so every directive above (partition, walltime, GPUs, QOS) was
# being silently ignored. Executable statements must come after the
# directive block.
if [ ! -d "ret_one" ]; then
  mkdir -p "ret_one"
fi
#- Log information
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
# echo "$(df -h | grep -v tmpfs)"
#- Load environments (module system + conda)
module unload cuda-cudnn
source ~/.bashrc
module list # list modules loaded
conda activate open_reasoner
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
# nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
nvidia-smi
#- Warning! Please do not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Use GPU ${CUDA_VISIBLE_DEVICES}" # which gpus
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
# export CUDA_DEVICE_ORDER="PCI_BUS_ID"
#- Important setting!!!
## otherwise it will cause an error of insufficient RDMA resources:
ulimit -l unlimited
## otherwise it will result in an insufficient virtual memory size error, especially when loading LLM:
ulimit -v unlimited
ulimit -n 65535
ulimit -u 4125556
#- Job step
# sleep 30h
cd /nfs_global/S/zhangxiaoyun/prm/openr
export PYTHONPATH=$(pwd)
# Start the policy/PRM serving stack, give it time to come up, then evaluate.
bash reason/llm_service/create_service_qwen2.5_math_vllm_gold_prm_speed.sh --acc 1.0 --policy_model_name Qwen2.5-Math-7B-Instruct
sleep 100s
bash scripts/eval/beam_search.sh --acc 1.0 --policy_model_name Qwen2.5-Math-7B-Instruct
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
# NOTE(review): this submits infer.slurm again at the end of the job. If this
# file IS infer.slurm, it resubmits itself indefinitely — confirm that is the
# intended self-requeue behavior, otherwise move this line to a separate
# submit wrapper.
sbatch --job-name=test -o "ret_one/%j.out" -e "ret_one/%j.err" infer.slurm
\ No newline at end of file
#!/bin/bash
#
# SLURM batch script: start the vLLM gold-PRM LLM service, then run the
# beam-search evaluation for Qwen2.5-Math-1.5B-Instruct at --acc 0.3.
#
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J inference # The job name
#SBATCH -o inference.out # Write the standard output to file named 'ret-<job_number>.out'
#SBATCH -e inference.err # Write the standard error to file named 'ret-<job_number>.err'
#- Resources
# (TODO)
# Please modify your requirements
#SBATCH -p r8nv-gpu-hw # Submit to 'nv-gpu' Partitiion
#SBATCH -t 1-06:00:00 # Run for a maximum time of 0 days, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:8 # Request M GPU per node
#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
#SBATCH --qos=gpu-normal # Request QOS Type
#SBATCH --constraint="L40"
###
### The system will alloc 8 or 16 cores per gpu by default.
### If you need more or less, use following:
### #SBATCH --cpus-per-task=K # Request K cores
###
###
### Without specifying the constraint, any available nodes that meet the requirement will be allocated
### You can specify the characteristics of the compute nodes, and even the names of the compute nodes
###
### #SBATCH --nodelist=gpu-v00 # Request a specific list of hosts
### #SBATCH --constraint="Volta|RTX8000" # Request GPU Type: Volta(V100 or V100S) or RTX8000
###
#- Log information
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
# echo "$(df -h | grep -v tmpfs)"
#- Load environments
#  (module system + conda env for the open reasoner stack)
module unload cuda-cudnn
source ~/.bashrc
module list # list modules loaded
conda activate open_reasoner
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
# nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
nvidia-smi
#- Warning! Please do not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Use GPU ${CUDA_VISIBLE_DEVICES}" # which gpus
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
# export CUDA_DEVICE_ORDER="PCI_BUS_ID"
#- Important setting!!!
## otherwise it will cause an error of insufficient RDMA resources:
ulimit -l unlimited
## otherwise it will result in an insufficient virtual memory size error, especially when loading LLM:
ulimit -v unlimited
ulimit -n 65535
ulimit -u 20000
#- Job step
# sleep 30h
cd /nfs_global/S/shiwenxuan/prm/openr
export PYTHONPATH=$(pwd)
# Start the policy/PRM serving stack, wait for it to come up, then evaluate.
bash reason/llm_service/create_service_qwen2.5_math_vllm_gold_prm_speed.sh --acc 0.3 --policy_model_name Qwen2.5-Math-1.5B-Instruct
sleep 100s
bash scripts/eval/beam_search.sh --acc 0.3 --policy_model_name Qwen2.5-Math-1.5B-Instruct
# sleep 6h
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -11,7 +11,7 @@ lock = multiprocessing.Lock()
print(id(lock), os.getpid())
import redis
redis_client = redis.Redis(host='127.0.0.1', port=20001, db=0)
redis_client = redis.Redis(host='127.0.0.1', port=20002, db=0)
def set_shared_value(key, value):
redis_client.set(key, value)
......@@ -71,6 +71,8 @@ def _qwen_math_gold_infer_fn(input_str: str, model, tokenizer, device, acc):
pattern = '<\|im_start\|>user\s*\n\s*(.*?)\s*<\|im_end\|>'
match = re.search(pattern, input_str, re.DOTALL)
if match is None:
print(input_str)
assert match is not None, f"No match found for pattern: {pattern}"
question = match.group(1)
solution = question_item_map[question]["solution"]
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
2024-09-28 13:31:28 | INFO | controller | args: Namespace(host='0.0.0.0', port=28777, dispatch_method='shortest_queue', ssl=False)
2024-09-28 13:31:28 | ERROR | stderr | INFO: Started server process [821247]
2024-09-28 13:31:28 | ERROR | stderr | INFO: Waiting for application startup.
2024-09-28 13:31:28 | ERROR | stderr | INFO: Application startup complete.
2024-09-28 13:31:28 | ERROR | stderr | INFO: Uvicorn running on http://0.0.0.0:28777 (Press CTRL+C to quit)
2024-09-28 13:31:56 | INFO | controller | Register a new worker: http://0.0.0.0:40010
2024-09-28 13:31:56 | INFO | controller | Register done: http://0.0.0.0:40010, {'model_names': ['math-shepherd-mistral-7b-prm'], 'speed': 1, 'queue_length': 0}
2024-09-28 13:31:56 | INFO | stdout | INFO: 127.0.0.1:34020 - "POST /register_worker HTTP/1.1" 200 OK
2024-09-28 13:32:20 | INFO | controller | Register a new worker: http://0.0.0.0:30010
2024-09-28 13:32:20 | INFO | controller | Register done: http://0.0.0.0:30010, {'model_names': ['mistral-7b-sft'], 'speed': 1, 'queue_length': 0}
2024-09-28 13:32:20 | INFO | stdout | INFO: 127.0.0.1:53512 - "POST /register_worker HTTP/1.1" 200 OK
2024-09-28 13:32:41 | INFO | controller | Receive heart beat. http://0.0.0.0:40010
2024-09-28 13:32:41 | INFO | stdout | INFO: 127.0.0.1:50970 - "POST /receive_heart_beat HTTP/1.1" 200 OK
2024-09-28 13:33:05 | INFO | controller | Receive heart beat. http://0.0.0.0:30010
2024-09-28 13:33:05 | INFO | stdout | INFO: 127.0.0.1:48436 - "POST /receive_heart_beat HTTP/1.1" 200 OK
2024-09-28 13:33:26 | INFO | controller | Receive heart beat. http://0.0.0.0:40010
2024-09-28 13:33:26 | INFO | stdout | INFO: 127.0.0.1:35488 - "POST /receive_heart_beat HTTP/1.1" 200 OK
2024-09-28 13:33:50 | INFO | controller | Receive heart beat. http://0.0.0.0:30010
2024-09-28 13:33:50 | INFO | stdout | INFO: 127.0.0.1:46292 - "POST /receive_heart_beat HTTP/1.1" 200 OK
2024-09-28 13:34:03 | INFO | controller | names: ['http://0.0.0.0:30010'], queue_lens: [0.0], ret: http://0.0.0.0:30010
2024-09-28 13:34:03 | INFO | stdout | INFO: 127.0.0.1:41020 - "POST /get_worker_address HTTP/1.1" 200 OK
2024-09-28 13:34:05 | INFO | controller | names: ['http://0.0.0.0:40010'], queue_lens: [0.0], ret: http://0.0.0.0:40010
2024-09-28 13:34:05 | INFO | stdout | INFO: 127.0.0.1:41034 - "POST /get_worker_address HTTP/1.1" 200 OK
2024-09-28 13:34:11 | INFO | controller | Receive heart beat. http://0.0.0.0:40010
2024-09-28 13:34:11 | INFO | stdout | INFO: 127.0.0.1:41044 - "POST /receive_heart_beat HTTP/1.1" 200 OK
2024-09-28 13:34:35 | INFO | controller | Receive heart beat. http://0.0.0.0:30010
2024-09-28 13:34:35 | INFO | stdout | INFO: 127.0.0.1:44238 - "POST /receive_heart_beat HTTP/1.1" 200 OK
......@@ -78,7 +78,8 @@ if __name__ == "__main__":
else:
# assume qwen
prm_step_tag = "\n\n\n\n\n "
prm_format_str = "{question} {answer}"
# prm_format_str = "{question} {answer}"
prm_format_str = "<|im_start|>user\n{question}<|im_end|> {answer}"
if "qwen" in config.LM.lower():
lm_step_tag = "\n\n"
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -430,14 +430,22 @@ class SearchTree:
self._expand_leaf_node(root, simulate_env, reward_model_fn)
self.root = root
end_nodes, top_k_nodes = [], [(-root._initial_value, root, simulate_env.copy())]
simulate_env_copy = simulate_env.copy()
print(simulate_env_copy.config["max_actions"])
simulate_env_copy.config["max_actions"] = int(simulate_env_copy.config["max_actions"] / beam_size)
print(simulate_env_copy.config["max_actions"])
# end_nodes, top_k_nodes = [], [(-root._initial_value, root, simulate_env.copy())]
end_nodes, top_k_nodes = [], [(-root._initial_value, root, simulate_env_copy)]
k = beam_size
print("k: ", k)
for _ in range(max_step + 1):
cur_nodes_to_search = top_k_nodes
top_k_nodes = []
for cur_neg_v, cur_node, cur_env in cur_nodes_to_search:
print("cur_node.children_num: ", len(cur_node.children))
if cur_node.terminated:
print("signal for k-1")
end_nodes.append((cur_neg_v, cur_node, cur_env))
k -= 1
elif k > 0:
......@@ -454,6 +462,7 @@ class SearchTree:
key=lambda x: x[2],
reverse=True,
)[:k]
print("top_k_children_num: ", len(top_k_children))
for c_act, c_node, c_value in top_k_children:
new_env = cur_env.copy()
heapq.heappush(top_k_nodes, (-c_value, c_node, new_env))
......@@ -679,7 +688,7 @@ class SearchTree:
prms = reward_fn(
[
(
f'<|im_start|>user\n{simulate_env.question}<|im_end|>',
simulate_env.question,
simulate_env.answer + x["action"],
)
for x in simulate_env.legal_actions
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -6,11 +6,11 @@ echo "RAY TEMP DIR is $RAY_TEMP_DIR"
HOST_ADDR=0.0.0.0
CONTROLER_PORT=28777
WORKER_BASE_PORT=30010
ACC=1.0
ACC=0.5
MODEL_BASE=/share/collab/codemodel/models
CUDA_DEVICE_BASE=0
POLICY_MODEL_NAME=Qwen2.5-Math-7B-Instruct
POLICY_MODEL_NAME=Qwen2.5-Math-1.5B-Instruct
while [[ "$#" -gt 0 ]]; do
case $1 in
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
export RAY_TEMP_DIR="/tmp/ray_$SLURM_JOBID"
echo "RAY TEMP DIR is $RAY_TEMP_DIR"
POLICY_MODEL_NAME=Qwen2.5-Math-7B-Instruct
ACC=1.0
POLICY_MODEL_NAME=Qwen2.5-Math-1.5B-Instruct
ACC=0.5
while [[ "$#" -gt 0 ]]; do
case $1 in
......@@ -15,7 +15,7 @@ done
VALUE_MODEL_NAME=${POLICY_MODEL_NAME}_RM
SAVE_DIR="results/${POLICY_MODEL_NAME}/${ACC}"
SAVE_DIR="/nfs_global/S/shiwenxuan/prm/openr/results/${POLICY_MODEL_NAME}/${ACC}"
echo "POLICY_MODEL_NAME is $POLICY_MODEL_NAME"
echo "VALUE_MODEL_NAME is $VALUE_MODEL_NAME"
......@@ -26,7 +26,7 @@ python reason/evaluation/evaluate.py \
--task_name MATH \
--temperature 0.7 \
--max_new_tokens 2048 \
--num_sequence 1 \
--num_sequence 2 \
--tree_max_width 4 \
--tree_max_depth 50 \
--save_dir $SAVE_DIR \
......
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
### experiment
- policy model: Qwen2.5-Math-1.5B-Instruct
- reward model: Qwen2.5-Math-1.5B-Instruct
- method: beam search
- results by gold-PRM accuracy (ACC):
  - ACC:1.0 result:[{"majority_vote": 0.828, "total_completion_tokens": 2544.756}]
  - ACC:0.9 result:[{"majority_vote": 0.798, "total_completion_tokens": 2576.35}]
  - ACC:0.8 result:[{"majority_vote": 0.794, "total_completion_tokens": 2497.672}]
  - ACC:0.7 result:[{"majority_vote": 0.782, "total_completion_tokens": 2502.832}]
  - ACC:0.6 result:[{"majority_vote": 0.76, "total_completion_tokens": 2491.27}]
  - ACC:0.5 result:[{'majority_vote': 0.724, 'total_completion_tokens': 2400.16}]
  - ACC:0.4 result:[{'majority_vote': 0.75, 'total_completion_tokens': 2418.876}]
  - ACC:0.3 result:[{'majority_vote': 0.748, 'total_completion_tokens': 2463.39}]
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment