Initial commit

e4d72d72 · Shi wenxuan · e4d72d72 · e4d72d72 · e4d72d72 · e4d72d72
Commit e4d72d72 authored Aug 07, 2025 by Shi wenxuan
511 changed files
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
+name: Build docs
+
+on:
+  push:
+    branches: ["main"]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+# Token permissions for this workflow: read the repository, write to Pages,
+# and request an OIDC id-token.
+permissions:
+  contents: read
+  pages: write
+  id-token: write
+
+# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
+# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
+concurrency:
+  group: "pages"
+  cancel-in-progress: false
+
+jobs:
+  # Build docs and deploy to the website
+  deploy:
+    environment:
+      name: github-pages
+      url: ${{ steps.deployment.outputs.page_url }}
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Configure Git Credentials
+        # Values are quoted so the shell never treats "[bot]" as a glob pattern
+        run: |
+          git config user.name "github-actions[bot]"
+          git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.x"
+      # cache_id is the week number (date +%V), so the cache key changes weekly
+      - run: echo "cache_id=$(date --utc '+%V')" >> "$GITHUB_ENV"
+      - uses: actions/cache@v4
+        with:
+          key: mkdocs-material-${{ env.cache_id }}
+          path: .cache
+          restore-keys: |
+            mkdocs-material-
+      - run: pip install -r requirements/docs.txt
+      - run: mkdocs build
+      - name: Upload artifact
+        uses: actions/upload-pages-artifact@v3
+        with:
+          path: 'site'
+      - name: Deploy to GitHub Pages
+        id: deployment
+        uses: actions/deploy-pages@v4
\ No newline at end of file
--- a/.github/workflows/gpu_tests.yml
+++ b/.github/workflows/gpu_tests.yml
+name: Integration tests
+
+on:
+  pull_request:
+    branches: [ "main" ]
+    types: [opened, synchronize, reopened, labeled]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Llama GPU integration tests on the self-hosted GPU runner.
+  # Runs when the 'run GPU tests' label is added to a PR, or on a manual dispatch
+  # (a dispatch event carries no label, so it has to be allowed explicitly).
+  gpu-tests-llama:
+    runs-on: self-hosted-nemo-gpus-1
+    if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'run GPU tests' }}
+    steps:
+    - name: Cleanup old containers
+      run: |
+        docker system prune --all --filter "until=360h" --force
+    # The repository is checked out into a directory named after the run id
+    - uses: actions/checkout@v4
+      with:
+        path: ${{ github.run_id }}
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        cd ${{ github.run_id }}
+        python -m pip install --upgrade pip
+        pip uninstall -y nemo-skills nemo_run
+        pip install -e .
+        pip install -r requirements/common-tests.txt
+        ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24
+    - name: Run GPU tests
+      timeout-minutes: 180
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        cd ${{ github.run_id }}
+        nvidia-smi
+        set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
+        ./tests/gpu-tests/run_llama.sh
+    # Runs even when earlier steps fail: removes test artifacts and stops all containers
+    - name: Cleanup
+      if: always()
+      run: |
+        docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.6.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
+        docker ps -a -q | xargs -r docker stop
+
+  # Qwen GPU integration tests on the self-hosted GPU runner.
+  # Runs when the 'run GPU tests' label is added to a PR, or on a manual dispatch
+  # (a dispatch event carries no label, so it has to be allowed explicitly).
+  gpu-tests-qwen:
+    runs-on: self-hosted-nemo-gpus-1
+    if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'run GPU tests' }}
+    steps:
+    - name: Cleanup old containers
+      run: |
+        docker system prune --all --filter "until=360h" --force
+    # The repository is checked out into a directory named after the run id
+    - uses: actions/checkout@v4
+      with:
+        path: ${{ github.run_id }}
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        cd ${{ github.run_id }}
+        python -m pip install --upgrade pip
+        pip uninstall -y nemo-skills nemo_run
+        pip install -e .
+        pip install -r requirements/common-tests.txt
+        ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24
+    - name: Run GPU tests
+      timeout-minutes: 180
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      run: |
+        cd ${{ github.run_id }}
+        nvidia-smi
+        set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
+        ./tests/gpu-tests/run_qwen.sh
+    # Runs even when earlier steps fail: removes test artifacts and stops all containers
+    - name: Cleanup
+      if: always()
+      run: |
+        docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.6.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
+        docker ps -a -q | xargs -r docker stop
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
+name: CPU tests
+
+on:
+  pull_request:
+    branches: [ "main" ]
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
+
+permissions:
+  contents: read
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  # Installs the package and runs every test not marked "gpu" on a hosted runner.
+  unit-tests:
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.10
+      uses: actions/setup-python@v5
+      with:
+        python-version: "3.10"
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install -e .
+        pip install -r requirements/common-tests.txt
+    - name: Run all tests
+      env:
+        NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+      # The sandbox container is started in the background; the tests reach it
+      # through the container IP exported as NEMO_SKILLS_SANDBOX_HOST.
+      run: |
+        docker pull igitman/nemo-skills:0.6.1
+        docker run --rm --name=local-sandbox igitman/nemo-skills-sandbox:0.6.1 &
+        sleep 120
+        export NEMO_SKILLS_SANDBOX_HOST="$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' local-sandbox)"
+        set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
+        ns prepare_data gsm8k math-500
+        python -m pytest tests/ -m "not gpu" --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=nemo_skills --cov=pipeline --durations=30 -rs -s -vvv
--- a/.gitignore
+++ b/.gitignore
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+*.json
+*.tar.gz
+*.tar
+*.npy
+*.info
+*.jsonl
+*.csv
+nemo_experiments
+wandb
+build
+.hypothesis
+*.zip
+*.egg-info
+*.xml
+*.DS_Store
+.coverage
+.venv
+*.lock
+
+__pycache__
+.ipynb_checkpoints
+
+cluster_configs/*
+!cluster_configs/example-*.yaml
+
+nemo_skills/dataset/ruler/*/
+nemo_skills/dataset/bfcl_v3/*/
+.idea/
+.idea/*
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+default_language_version:
+  python: python3
+
+ci:
+  autofix_prs: true
+  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
+  autoupdate_schedule: quarterly
+
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v5.0.0
+    hooks:
+      - id: check-yaml
+        exclude: ^mkdocs\.yml$
+      - id: check-case-conflict
+      - id: detect-private-key
+      - id: check-added-large-files
+        args: ['--maxkb=1000']
+      - id: requirements-txt-fixer
+
+  - repo: https://github.com/PyCQA/isort
+    rev: 5.13.2
+    hooks:
+      - id: isort
+        name: Format imports
+        exclude: docs/
+        args: ["--profile", "black"]
+
+  - repo: https://github.com/psf/black
+    rev: 24.10.0
+    hooks:
+      - id: black
+        name: Format code
+        exclude: docs/source-app
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
+# Contributing to NeMo-Skills
+
+Thanks for your interest in contributing to NeMo-Skills!
+
+## Running Tests
+
+TBD
+
+## Code Quality
+
+- Follow the existing code style and conventions
+- Write tests for new features
+- Update documentation to reflect your changes
+- Ensure all tests pass before submitting a PR
+- Do not add arbitrary defaults for configs; be as explicit as possible.
+
+
+## Signing Your Work
+
+* We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
+
+* Any contribution which contains commits that are not Signed-Off will not be accepted.
+
+* To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
+  ```bash
+  $ git commit -s -m "Add cool feature."
+  ```
+  This will append the following to your commit message:
+  ```
+  Signed-off-by: Your Name <your@email.com>
+  ```
+
+* Full text of the DCO:
+
+  ```
+  Developer Certificate of Origin
+  Version 1.1
+
+  Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
+
+  Everyone is permitted to copy and distribute verbatim copies of this
+  license document, but changing it is not allowed.
+
+
+  Developer's Certificate of Origin 1.1
+
+  By making a contribution to this project, I certify that:
+
+  (a) The contribution was created in whole or in part by me and I
+      have the right to submit it under the open source license
+      indicated in the file; or
+
+  (b) The contribution is based upon previous work that, to the best
+      of my knowledge, is covered under an appropriate open source
+      license and I have the right under that license to submit that
+      work with modifications, whether created in whole or in part
+      by me, under the same open source license (unless I am
+      permitted to submit under a different license), as indicated
+      in the file; or
+
+  (c) The contribution was provided directly to me by some other
+      person who certified (a), (b) or (c) and I have not modified
+      it.
+
+  (d) I understand and agree that this project and the contribution
+      are public and that a record of the contribution (including all
+      personal information I submit with it, including my sign-off) is
+      maintained indefinitely and may be redistributed consistent with
+      this project or the open source license(s) involved.
+  ```
\ No newline at end of file
--- a/LICENSE
+++ b/LICENSE
--- a/MANIFEST.in
+++ b/MANIFEST.in
+recursive-include nemo_skills *.yaml
+recursive-include nemo_skills *.txt
\ No newline at end of file
--- a/README.md
+++ b/README.md
+# NeMo Skills
+
+NeMo-Skills is a collection of pipelines to improve "skills" of large language models (LLMs). We support everything needed for LLM development, from synthetic data generation, to model training, to evaluation on a wide range of benchmarks. Start developing on a local workstation and move to a large-scale Slurm cluster with just a one-line change.
+
+Here are some of the features we support:
+
+- [Flexible LLM inference](https://nvidia.github.io/NeMo-Skills/pipelines/generation/):
+  - Seamlessly switch between API providers, local server and large-scale slurm jobs for LLM inference.
+  - Host models (on 1 or many nodes) with [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [vLLM](https://github.com/vllm-project/vllm), [sglang](https://github.com/sgl-project/sglang) or [Megatron](https://github.com/NVIDIA/Megatron-LM).
+  - Scale SDG jobs from 1 GPU on a local machine all the way to tens of thousands of GPUs on a slurm cluster.
+- [Model evaluation](https://nvidia.github.io/NeMo-Skills/pipelines/evaluation):
+  - Evaluate your models on many popular benchmarks.
+    - Math problem solving: hmmt_feb25, brumo25, aime24, aime25, omni-math (and many more)
+    - Formal proofs in Lean: minif2f, proofnet
+    - Coding skills: scicode, livecodebench, human-eval, mbpp
+    - Chat/instruction following: ifbench, ifeval, arena-hard
+    - General knowledge: mmlu, mmlu-pro, gpqa
+    - Long context: ruler
+  - Easily parallelize each evaluation across many slurm jobs, self-host LLM judges, bring your own prompts or change benchmark configuration in any other way.
+- [Model training](https://nvidia.github.io/NeMo-Skills/pipelines/training): Train models using [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/), [NeMo-RL](https://github.com/NVIDIA/NeMo-RL/) or [verl](https://github.com/volcengine/verl).
+
+## News
+
+* [07/30/2025]: The datasets used to train OpenReasoning models are released! Math and code are available as part of [Nemotron-Post-Training-Dataset-v1](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1) and science is available in
+[OpenScienceReasoning-2](https://huggingface.co/datasets/nvidia/OpenScienceReasoning-2).
+See our [documentation](https://nvidia.github.io/NeMo-Skills/releases/openreasoning/training) for more details.
+
+* [07/18/2025]: We released [OpenReasoning](https://nvidia.github.io/NeMo-Skills/releases/openreasoning/) models! SOTA scores on math, coding and science benchmarks.
+
+![Evaluation Results with pass@1](docs/releases/openreasoning/pass-1.png)
+
+![Evaluation Results with GenSelect](docs/releases/openreasoning/genselect.png)
+
+
+* [04/23/2025]: We released [OpenMathReasoning](https://nvidia.github.io/NeMo-Skills/openmathreasoning1) dataset and models!
+
+  * OpenMathReasoning dataset has 306K unique mathematical problems sourced from [AoPS forums](https://artofproblemsolving.com/community) with:
+      * 3.2M long chain-of-thought (CoT) solutions
+      * 1.7M long tool-integrated reasoning (TIR) solutions
+      * 566K samples that select the most promising solution out of many candidates (GenSelect)
+  * OpenMath-Nemotron models are SoTA open-weight models on math reasoning benchmarks at the time of release!
+
+* [10/03/2024]: We released [OpenMathInstruct-2](https://nvidia.github.io/NeMo-Skills/openmathinstruct2) dataset and models!
+
+  * OpenMathInstruct-2 is a math instruction tuning dataset with 14M problem-solution pairs generated using the Llama3.1-405B-Instruct model.
+  * OpenMath-2-Llama models show significant improvements compared to their Llama3.1-Instruct counterparts.
+
+## Getting started
+
+To get started, follow these [steps](https://nvidia.github.io/NeMo-Skills/basics),
+browse available [pipelines](https://nvidia.github.io/NeMo-Skills/pipelines) or run `ns --help` to see all available
+commands and their options.
+
+You can find more examples of how to use NeMo-Skills in the [tutorials](https://nvidia.github.io/NeMo-Skills/tutorials) page.
+
+We've built and released many popular models and datasets using NeMo-Skills. See all of them in the [Papers & Releases](./releases/index.md) documentation.
+
+You can find the full documentation [here](https://nvidia.github.io/NeMo-Skills/).
+
+
+## Contributing
+
+We welcome contributions to NeMo-Skills! Please see our [Contributing Guidelines](./CONTRIBUTING.md) for more information on how to get involved.
+
+
+Disclaimer: This project is strictly for research purposes, and not an official product from NVIDIA.
--- a/cluster_configs/example-local.yaml
+++ b/cluster_configs/example-local.yaml
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+executor: local
+
+containers:
+  trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0
+  vllm: igitman/nemo-skills-vllm:0.7.0
+  sglang: igitman/nemo-skills-sglang:0.7.0
+  nemo: igitman/nemo-skills-nemo:0.7.0
+  megatron: igitman/nemo-skills-megatron:0.7.0
+  sandbox: igitman/nemo-skills-sandbox:0.7.0
+  nemo-skills: igitman/nemo-skills:0.7.0
+  verl: igitman/nemo-skills-verl:0.7.0
+  nemo-rl: igitman/nemo-skills-nemo-rl:0.7.0
+
+# add required mounts for models/data here
+# the code is mounted automatically inside /nemo_run/code
+# but please note that we only package what's tracked by git + jsonl files inside nemo_skills/dataset
+
+# mounts:
+# you can define as many as you need, e.g.
+#   - /mnt/datadrive/models:/models
+#   - /mnt/datadrive/data:/data
+#   - /home/<username>/workspace:/workspace
+#   you can also override container libraries by directly mounting over them. E.g. to override NeMo-Aligner do
+#   - <...>/NeMo-Aligner:/opt/NeMo-Aligner
\ No newline at end of file
--- a/cluster_configs/example-slurm.yaml
+++ b/cluster_configs/example-slurm.yaml
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+executor: slurm
+
+containers:
+  trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0
+  vllm: igitman/nemo-skills-vllm:0.7.0
+  sglang: igitman/nemo-skills-sglang:0.7.0
+  nemo: igitman/nemo-skills-nemo:0.7.0
+  megatron: igitman/nemo-skills-megatron:0.7.0
+  sandbox: igitman/nemo-skills-sandbox:0.7.0
+  nemo-skills: igitman/nemo-skills:0.7.0
+  verl: igitman/nemo-skills-verl:0.7.0
+  nemo-rl: igitman/nemo-skills-nemo-rl:0.7.0
+
+job_name_prefix: "nemo_skills:"
+
+
+# define this for ssh access
+# ssh_tunnel:
+#   host: <slurm host>
+#   user: <username>
+#   job_dir: <some location on slurm cluster to keep job metadata, uploaded code and generated sbatch files>
+#   identity: <can specify ssh key to avoid entering password>
+
+
+# if you're running directly from cluster, you only need to define job_dir and shouldn't use ssh_tunnel
+# job_dir: <some location on slurm cluster to keep job metadata, uploaded code and generated sbatch files>
+
+
+# define your account/partition here
+# account: <slurm account>
+# partition: <slurm partition>
+# cpu_partition: <if cluster has a dedicated cpu partition, you can define it here>
+
+
+# add required mounts for models/data here
+# the code is mounted automatically inside /nemo_run/code
+# but please note that we only package what's tracked by git + jsonl files inside nemo_skills/dataset
+
+# mounts:
+#   - <slurm location for your data/models>:<where to mount in a container>
+#   e.g.
+#   - <path on slurm>/trt_models:/trt_models
+#   - <path on slurm>/data:/data
+#   you can also override container libraries by directly mounting over them. E.g. to override NeMo-Aligner do
+#   - <path on slurm>/NeMo-Aligner:/opt/NeMo-Aligner
+
+
+# can use this section to set timeouts for different partitions
+# this will be used as a slurm parameter + to signal SFT job to finish
+# before the timeout to have time to save the last checkpoint
+# timeouts:
+#   partition_name1: 06:00:00
+#   partition_name2: 01:30:00
--- a/dataset_explorer_demo/README.md
+++ b/dataset_explorer_demo/README.md
+# Dataset Explorer Demo
+
+1. Download data TBD
+2. Retrieve similar questions from OpenMathInstruct2. Do it for all benchmarks you want to compare against.
+   Assuming you're running from this folder.
+
+   ```
+   python -m nemo_skills.inference.retrieve_similar \
+       ++retrieve_from=./data.jsonl \
+       ++compare_to="../nemo_skills/dataset/<benchmark>/test.jsonl" \
+       ++output_file=./similar-retrieved-openmath2/<benchmark>.jsonl \
+       ++top_k=5
+   ```
+
+3. Let's do the same for the original MATH training set to get a sense of whether OpenMathInstruct-2 is in the same
+   distribution or not.
+
+   ```
+   python -m nemo_skills.inference.retrieve_similar \
+       ++retrieve_from=../nemo_skills/dataset/math/train.jsonl \
+       ++compare_to="../nemo_skills/dataset/<benchmark>/test.jsonl" \
+       ++output_file=./similar-retrieved-math-train/<benchmark>.jsonl \
+       ++top_k=5
+   ```
+
+4. Start the Gradio demo.
+
+   ```
+   python visualize_similar.py
+   ```
\ No newline at end of file
--- a/dataset_explorer_demo/visualize_similar.py
+++ b/dataset_explorer_demo/visualize_similar.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import os
+import random
+import re
+from functools import lru_cache
+
+import gradio as gr
+from latex2mathml.converter import convert
+from latex2mathml.exceptions import NoAvailableTokensError
+
+
+@lru_cache(maxsize=1000)
+def load_jsonl(file_path):
+    """Read a JSON Lines file and return its records as a list.
+
+    Results are memoized per path by ``lru_cache``, so every caller receives the
+    same list object and should treat it as read-only.
+
+    Args:
+        file_path: Path to a file holding one JSON document per line.
+
+    Returns:
+        list: The parsed records, in file order.
+    """
+    # Read as UTF-8 explicitly rather than relying on the platform default encoding.
+    # Stray blank lines are skipped instead of crashing json.loads.
+    with open(file_path, 'r', encoding='utf-8') as f:
+        return [json.loads(line) for line in f if line.strip()]
+
+
+@lru_cache(maxsize=10000)
+def render_latex(text):
+    """Replace LaTeX math fragments in ``text`` with MathML markup.
+
+    Three kinds of fragments are rewritten, in this order:
+      1. ``\\begin{align*} ... \\end{align*}`` environments,
+      2. bracketed ``[...]`` spans, unless the span holds only digits, commas,
+         spaces and hyphens (so intervals such as ``[1, 2]`` stay as text),
+      3. inline ``$...$`` spans.
+
+    Results are memoized per input string by ``lru_cache``.
+
+    Args:
+        text: Source string that may contain LaTeX.
+
+    Returns:
+        str: ``text`` with the recognised fragments replaced by ``<math>`` elements.
+    """
+
+    def replace_matrix(match):
+        # One <mtr> per "\\"-separated row, wrapped in parentheses
+        matrix_content = match.group(1)
+        rows = matrix_content.split('\\\\')
+        mml_rows = ''.join(f'<mtr><mtd>{convert_and_clean(row.strip())}</mtd></mtr>' for row in rows)
+        return f'<mrow><mo>(</mo><mtable rowspacing="4pt" columnspacing="1em">{mml_rows}</mtable><mo>)</mo></mrow>'
+
+    def replace_align(match):
+        # Rows containing '&' become a right-aligned / left-aligned cell pair;
+        # rows without it become a single centred cell.
+        align_content = match.group(1)
+        rows = align_content.split('\\\\')
+        mml_rows = []
+        for row in rows:
+            if '&' in row:
+                # Split on the first '&' only: a row with several alignment
+                # markers would otherwise fail to unpack into two values.
+                left, right = row.split('&', 1)
+                mml_row = f'<mtr><mtd columnalign="right">{convert_and_clean(left.strip())}</mtd><mtd columnalign="left">{convert_and_clean(right.strip())}</mtd></mtr>'
+            else:
+                mml_row = f'<mtr><mtd columnalign="center">{convert_and_clean(row.strip())}</mtd></mtr>'
+            mml_rows.append(mml_row)
+        return f'<mtable columnspacing="1em" rowspacing="3pt" displaystyle="true">{"".join(mml_rows)}</mtable>'
+
+    def convert_and_clean(latex):
+        # Convert one LaTeX fragment to MathML with the outer <math> wrapper removed.
+        # When the converter raises NoAvailableTokensError, the (pre-processed)
+        # LaTeX is returned unchanged.
+        try:
+            # Pre-process nested matrices
+            latex = re.sub(r'\\begin{pmatrix}(.*?)\\end{pmatrix}', replace_matrix, latex, flags=re.DOTALL)
+
+            # Handle \displaystyle
+            latex = latex.replace('\\displaystyle', '')
+
+            # Convert LaTeX to MathML
+            mathml = convert(latex)
+            mathml = re.sub(r'<math.*?>(.*)</math>', r'\1', mathml)
+            return mathml
+        except NoAvailableTokensError:
+            return latex
+
+    # Handle align* environment
+    text = re.sub(
+        r'\\begin{align\*}(.*?)\\end{align\*}',
+        lambda m: f'<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">{replace_align(m)}</math>',
+        text,
+        flags=re.DOTALL,
+    )
+
+    # Handle display math, excluding intervals
+    text = re.sub(
+        r'\[(?![-\d, ]+\])(.*?)\]',
+        lambda m: f'<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">{convert_and_clean(m.group(1))}</math>',
+        text,
+        flags=re.DOTALL,
+    )
+
+    # Handle inline math
+    text = re.sub(
+        r'\$(.*?)\$',
+        lambda m: f'<math xmlns="http://www.w3.org/1998/Math/MathML">{convert_and_clean(m.group(1))}</math>',
+        text,
+    )
+
+    return text
+
+
+@lru_cache(maxsize=1000)
+def display_entry(index, test_set):
+    """Build the HTML comparison view for one test-set problem.
+
+    Args:
+        index: Position of the problem within the sorted test set.
+        test_set: Test-set name without the ``.jsonl`` extension.
+
+    Returns:
+        str: HTML showing the problem and, side by side, its most similar
+        OpenMathInstruct-2 and MATH training set problems; or an error message
+        when ``index`` is out of range.
+    """
+    openmath2_rows, math_train_rows = load_test_sets(f"{test_set}.jsonl")
+
+    # Check if the index is valid
+    total = len(openmath2_rows)
+    if index < 0 or index >= total:
+        return f"Error: Invalid index. Please enter a number between 0 and {total - 1}."
+
+    openmath2_entry = openmath2_rows[index]
+    math_train_entry = math_train_rows[index]
+
+    if test_set == "gsm8k":
+        # GSM8K text is shown as-is, without LaTeX rendering
+        test_problem = openmath2_entry['problem']
+        similar_openmath2 = openmath2_entry['similar_items']
+        similar_math_train = math_train_entry['similar_items']
+    else:
+        test_problem = render_latex(openmath2_entry['problem'])
+        similar_openmath2 = [render_latex(item) for item in openmath2_entry['similar_items']]
+        similar_math_train = [render_latex(item) for item in math_train_entry['similar_items']]
+
+    def as_list_items(candidates):
+        # One <li> per candidate, concatenated in order
+        return "".join(f"<li>{cand}</li>" for cand in candidates)
+
+    fragments = [
+        f"<h2>Test set problem:</h2><p>{test_problem}</p>",
+        "<hr>",
+        "<div style='display: flex;'>",
+        # Left column
+        "<div style='flex: 1; padding-right: 10px;'>",
+        "<h2>Most similar OpenMathInstruct-2 problems:</h2><ol>",
+        as_list_items(similar_openmath2),
+        "</ol></div>",
+        # Vertical divider
+        "<div style='border-left: 1px solid #ccc;'></div>",
+        # Right column
+        "<div style='flex: 1; padding-left: 10px;'>",
+        "<h2>Most similar MATH training set problems:</h2><ol>",
+        as_list_items(similar_math_train),
+        "</ol></div>",
+        "</div>",
+    ]
+    return "".join(fragments)
+
+
+def random_entry(data):
+    """Return a uniformly random valid index into ``data``."""
+    upper_bound = len(data)
+    return random.randrange(0, upper_bound)
+
+
+@lru_cache(maxsize=10)
+def load_test_sets(test_set):
+    """Load and align the two similarity files for one test set.
+
+    Args:
+        test_set: File name (including ``.jsonl``) that exists in both
+            ``./similar-retrieved-openmath2`` and ``./similar-retrieved-math-train``.
+
+    Returns:
+        tuple: ``(data_openmath2, data_math_train)``, both sorted by the
+        ``problem`` field so that equal indices refer to the same problem.
+
+    Raises:
+        ValueError: If the two files differ in length or, after sorting,
+            contain different problems at the same position.
+    """
+    file_path_openmath2 = f'./similar-retrieved-openmath2/{test_set}'
+    file_path_math_train = f'./similar-retrieved-math-train/{test_set}'
+
+    # Sort both datasets based on the 'problem' field (or use 'id' if available).
+    # sorted() builds new lists, leaving the lists memoized by load_jsonl untouched.
+    data_openmath2 = sorted(load_jsonl(file_path_openmath2), key=lambda x: x['problem'])
+    data_math_train = sorted(load_jsonl(file_path_math_train), key=lambda x: x['problem'])
+
+    # Check if the sorted datasets have the same length and matching problems
+    if len(data_openmath2) != len(data_math_train):
+        raise ValueError(
+            f"Datasets have different lengths: OpenMathInstruct-2 ({len(data_openmath2)}) vs MATH training set ({len(data_math_train)})"
+        )
+
+    for i, (entry_openmath2, entry_math_train) in enumerate(zip(data_openmath2, data_math_train)):
+        if entry_openmath2['problem'] != entry_math_train['problem']:
+            raise ValueError(
+                f"Mismatch at index {i}: OpenMathInstruct-2 problem doesn't match MATH training set problem"
+            )
+
+    return data_openmath2, data_math_train
+
+
+# Every .jsonl file in the OpenMathInstruct-2 results directory is one selectable
+# test set. Sorted so the dropdown order (and its default entry) does not depend
+# on the arbitrary order returned by os.listdir.
+test_sets = sorted(f for f in os.listdir('./similar-retrieved-openmath2') if f.endswith('.jsonl'))
+
+# List the MATH test set first when it is present.
+if "math.jsonl" in test_sets:
+    test_sets.remove("math.jsonl")
+    test_sets.insert(0, "math.jsonl")
+
+# Display names are the file names without the extension.
+test_set_names = [os.path.splitext(f)[0] for f in test_sets]
+
+# UI layout: intro text, an optional warning, the test-set / index / random
+# controls in one row, and an HTML panel with the rendered comparison.
+with gr.Blocks() as demo:
+    gr.Markdown("# OpenMathInstruct-2 test set contamination explorer")
+    gr.Markdown(
+        "During construction of OpenMathInstruct-2 we generated many synthetic problems. "
+        "We did a very thorough decontamination to remove exact duplicates (including rephrases) with popular benchmarks.<br>"
+        "Still our dataset contains many questions that are very similar to test sets. "
+        "To make things more transparent we created this demo, that you can use to explore "
+        "most similar questions from our data for each of the test set problems.<br>"
+        "We also provide closest examples from MATH training set, since it was used as seed data "
+        "to create our dataset and in most cases that training set already contains very similar questions to the test sets!<br>"
+        "See our full dataset at HuggingFace: [OpenMathInstruct-2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2)<br>"
+        "And read our [paper](https://arxiv.org/abs/2410.01560) to learn more about the decontamination process and how we retrieve similar questions."
+    )
+
+    warning_box = gr.Markdown(visible=False)
+
+    with gr.Row():
+        test_set_dropdown = gr.Dropdown(choices=test_set_names, label="Select Test Set", value=test_set_names[0])
+        index_input = gr.Number(label="Problem Index", value=0, step=1)
+        random_button = gr.Button("Random Problem")
+
+    output = gr.HTML()
+
+    # Name of the test set currently on display
+    current_test_set = gr.State(test_set_names[0])
+
+    def update_test_set(test_set):
+        """Switch to ``test_set``: reset the index to 0 and refresh every widget."""
+        data_openmath2, data_math_train = load_test_sets(f"{test_set}.jsonl")
+        warning = ""
+        warning_visible = False
+        if test_set == "omni-math":
+            warning = "⚠️ Since Omni-Math benchmarks was released after we finished training of our models, we didn't perform decontamination with it and some of the problems might match exactly!"
+            warning_visible = True
+        # One return value per entry of the `outputs` list wired up below
+        return (
+            0,
+            display_entry(0, test_set),
+            warning,
+            gr.update(visible=warning_visible),
+            test_set,
+            gr.update(maximum=len(data_openmath2) - 1),  # Update the maximum allowed index
+        )
+
+    def display_entry_wrapper(index, current_test_set):
+        """Render the problem at ``index``, clamped to the valid range."""
+        data_openmath2, _ = load_test_sets(f"{current_test_set}.jsonl")
+        # A cleared Number field arrives as None; fall back to the first problem
+        # instead of failing in int().
+        if index is None:
+            index = 0
+        # Ensure the index is within bounds
+        index = max(0, min(int(index), len(data_openmath2) - 1))
+        return display_entry(index, current_test_set)
+
+    def random_entry_wrapper(current_test_set):
+        """Pick a random problem index from the current test set."""
+        data_openmath2, _ = load_test_sets(f"{current_test_set}.jsonl")
+        return random_entry(data_openmath2)
+
+    test_set_dropdown.change(
+        update_test_set,
+        inputs=[test_set_dropdown],
+        outputs=[
+            index_input,
+            output,
+            warning_box,
+            warning_box,
+            current_test_set,
+            index_input,
+        ],
+    )
+    index_input.change(display_entry_wrapper, inputs=[index_input, current_test_set], outputs=output)
+    # The button only writes a new index; the index change handler then re-renders
+    random_button.click(random_entry_wrapper, inputs=[current_test_set], outputs=index_input)
+
+    # Render the initial problem when the page loads
+    demo.load(display_entry_wrapper, inputs=[index_input, current_test_set], outputs=output)
+
+demo.launch(debug=False, server_name='0.0.0.0', server_port=5005)
--- a/dockerfiles/Dockerfile.megatron
+++ b/dockerfiles/Dockerfile.megatron
+# Image with Megatron-LM installed from a pinned commit on top of the
+# NGC PyTorch container.
+FROM nvcr.io/nvidia/pytorch:25.04-py3
+
+# Set working directory
+WORKDIR /opt
+
+# Install megatron-lm
+# The commit is pinned so that rebuilding the image yields the same code;
+# the checkout is installed in editable mode.
+ENV MEGATRON_COMMIT=dfc0a3d004391a82d8d8a5a6d991b65eaed0190c
+RUN git clone https://github.com/NVIDIA/Megatron-LM && \
+    cd Megatron-LM && \
+    git checkout $MEGATRON_COMMIT && \
+    pip install -e .
+
+# installing libs for hf -> megatron conversion
+RUN pip install transformers accelerate
+
+# fix for https://github.com/NVIDIA/NeMo/issues/12836
+# there is a global requirements lock that we need to remove..
+# (the constraint file is replaced by an empty one rather than just deleted)
+RUN rm /etc/pip/constraint.txt && touch /etc/pip/constraint.txt
+RUN pip install -U "nvidia-modelopt[all]>=0.27"
+
+# Make the cloned Megatron-LM checkout importable.
+ENV PYTHONPATH=/opt/Megatron-LM
\ No newline at end of file
--- a/dockerfiles/Dockerfile.nemo
+++ b/dockerfiles/Dockerfile.nemo
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# copied from https://github.com/NVIDIA/NeMo-Aligner/blob/main/Dockerfile
+# with pinned NeMo-Aligner version for reproducibility
+
+# To build NeMo-Aligner from a base PyTorch container:
+#
+#   docker buildx build -t aligner:latest .
+#
+# To update NeMo-Aligner from a pre-built NeMo-Framework container:
+#
+#   docker buildx build --target=aligner-bump -t aligner:latest .
+#
+
+# Number of parallel threads for compute heavy build jobs
+# if you get errors building TE or Apex, decrease this to 4
+ARG MAX_JOBS=8
+# Git refs for dependencies.
+# NOTE: Dockerfile comments must be on their own line. A "# ..." placed after
+# an instruction is parsed as additional arguments, not as a comment, so the
+# "On: main" notes are kept on separate lines.
+ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
+ARG PYTRITON_VERSION=0.5.10
+# NEMO_TAG is on: main
+ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634
+# MLM_TAG is on: main
+ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3
+ARG ALIGNER_COMMIT=35fcfd9df754aff56f71cb3ba3382cc02384361a
+ARG TRTLLM_VERSION=v0.13.0
+ARG PROTOBUF_VERSION=4.24.4
+ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
+
+# Stage that clones (or updates) NeMo-Aligner at ALIGNER_COMMIT and installs it
+# in editable mode without pulling in its dependencies.
+FROM ${BASE_IMAGE} AS aligner-bump
+# Re-declare the global ARG so that it is visible inside this stage.
+ARG ALIGNER_COMMIT
+WORKDIR /opt
+# NeMo Aligner
+# bash flags: -e stop on first error, -x trace commands, -u fail on unset variables
+RUN <<"EOF" bash -exu
+# Reuse an existing checkout if there is one, otherwise clone
+if [[ ! -d NeMo-Aligner ]]; then
+    git clone https://github.com/NVIDIA/NeMo-Aligner.git
+fi
+cd NeMo-Aligner
+# Also fetch the merge refs of pull requests
+git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge'
+git checkout -f $ALIGNER_COMMIT
+# case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it
+# case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail
+git pull --rebase || true
+
+pip install --no-cache-dir --no-deps -e .
+EOF
+
+# Final image: builds apex, TensorRT-LLM, TransformerEngine, NeMo and Megatron-LM
+# from source and copies in the NeMo-Aligner checkout from the aligner-bump stage.
+FROM ${BASE_IMAGE} AS final
+LABEL "nemo.library"="nemo-aligner"
+WORKDIR /opt
+# needed in case git complains that it can't detect a valid email, this email is fake but works
+RUN git config --global user.email "worker@nvidia.com"
+# install latest apex (or the ref passed via --build-arg APEX_TAG=...)
+ARG APEX_TAG
+RUN pip uninstall -y apex && \
+    git clone https://github.com/NVIDIA/apex && \
+    cd apex && \
+    if [ -n "$APEX_TAG" ]; then \
+        git fetch origin "$APEX_TAG" && \
+        git checkout FETCH_HEAD; \
+    fi && \
+    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./
+
+# Git LFS
+# -y keeps apt-get from waiting for a confirmation that can never come in a build
+RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
+    apt-get install -y git-lfs && \
+    git lfs install && \
+    apt-get clean
+
+# TRTLLM
+ARG TRTLLM_VERSION
+RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
+    cd TensorRT-LLM && \
+    git checkout ${TRTLLM_VERSION} && \
+    . docker/common/install_tensorrt.sh && \
+    python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt  --python_bindings --benchmarks && \
+    pip install -e .
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/
+
+# install TransformerEngine
+ARG MAX_JOBS
+ARG TE_TAG
+RUN pip uninstall -y transformer-engine && \
+    git clone https://github.com/NVIDIA/TransformerEngine.git && \
+    cd TransformerEngine && \
+    if [ -n "$TE_TAG" ]; then \
+        git fetch origin "$TE_TAG" && \
+        git checkout FETCH_HEAD; \
+    fi && \
+    git submodule init && git submodule update && \
+    NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .
+
+RUN pip install fire
+
+# place any util pkgs here
+ARG PYTRITON_VERSION
+RUN pip install --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION
+ARG PROTOBUF_VERSION
+RUN pip install -U --no-deps protobuf==$PROTOBUF_VERSION
+RUN pip install --upgrade-strategy only-if-needed jsonlines
+
+# NeMo
+ARG NEMO_TAG
+RUN git clone https://github.com/NVIDIA/NeMo.git && \
+    cd NeMo && \
+    git pull && \
+    if [ -n "$NEMO_TAG" ]; then \
+        git fetch origin "$NEMO_TAG" && \
+        git checkout FETCH_HEAD; \
+    fi && \
+    pip uninstall -y nemo_toolkit sacrebleu && \
+    pip install -e ".[nlp]" && \
+    cd nemo/collections/nlp/data/language_modeling/megatron && make
+
+# MLM
+ARG MLM_TAG
+RUN pip uninstall -y megatron-core && \
+    git clone https://github.com/NVIDIA/Megatron-LM.git && \
+    cd Megatron-LM && \
+    git pull && \
+    if [ -n "$MLM_TAG" ]; then \
+        git fetch origin "$MLM_TAG" && \
+        git checkout FETCH_HEAD; \
+    fi && \
+    pip install -e .
+
+# Bring in the NeMo-Aligner checkout prepared in the aligner-bump stage
+COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
+RUN cd /opt/NeMo-Aligner && \
+    pip install --no-deps -e .
+
+# Apply the TensorRT-LLM patch shipped with NeMo-Aligner
+RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch
+
+# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
+RUN <<"EOF" bash -exu
+cd NeMo
+# Ensures we don't cherry-pick "future" origin/main commits
+git fetch -a
+# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
+# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
+# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
+# (superseded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
+for pr_and_commit in \
+  "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
+  "10652 60e677423667c029dd05875da72bf0719774f844" \
+  "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
+; do
+  # Each entry is "<PR number> <head commit of that PR>"
+  pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
+  head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
+  git fetch origin $head_pr_commit:PR-${pr}
+  # cherry-picks all commits between main and the top of the PR
+  git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
+  # Tag cherry-picks to help
+  git tag cherry-pick-PR-${pr}
+done
+EOF
+
+# patching gpt sft dataset to properly support packing
+# TODO: remove when integrated in NeMo
+COPY nemo_skills/training/gpt_sft_dataset.py /opt/NeMo/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py
--- a/dockerfiles/Dockerfile.nemo-rl
+++ b/dockerfiles/Dockerfile.nemo-rl
+# copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/main/docker/Dockerfile
+ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04
+FROM ${BASE_IMAGE} AS base
+
+# It is more convenient for users to run as root
+USER root
+
+# System packages plus the Nsight Systems CLI.
+# bash flags: -e stop on first error, -x trace commands, -u fail on unset
+# variables, -o pipefail fail a pipeline if any stage fails.
+RUN <<"EOF" bash -exu -o pipefail
+export DEBIAN_FRONTEND=noninteractive
+export TZ=America/Los_Angeles
+
+apt-get update
+# The last package carries no trailing backslash, so the command ends here
+# regardless of what follows it.
+apt-get install -y --no-install-recommends \
+    jq \
+    curl \
+    git \
+    rsync \
+    wget \
+    less \
+    vim
+
+# Nsight
+# apt-get is used throughout: the `apt` front end warns that its CLI is not
+# stable for use in scripts.
+apt-get install -y --no-install-recommends gnupg
+echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list
+# NOTE(review): the key URL hard-codes ubuntu1804/x86_64 while the repo line
+# above uses the detected release and architecture - confirm this is intended.
+apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
+apt-get update
+apt-get install -y nsight-systems-cli
+
+
+apt-get clean
+rm -rf /var/lib/apt/lists/*
+EOF
+
+# Install uv and python
+ARG UV_VERSION=0.7.2
+ARG PYTHON_VERSION=3.12
+ENV PATH="/root/.local/bin:$PATH"
+RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
+    uv python install ${PYTHON_VERSION}
+
+# Disable usage stats by default for users who are sensitive to sharing usage.
+# Users are encouraged to enable if they wish.
+ENV RAY_USAGE_STATS_ENABLED=0
+ENV NEMO_RL_VENV_DIR=/opt/ray_venvs
+
+
+# Stage that clones NeMo-RL at a pinned commit, pre-builds its virtual
+# environments and installs NeMo-Skills on top.
+FROM base AS hermetic
+
+# The default commit below is used unless --build-arg NEMO_RL_COMMIT=... is given.
+ARG NEMO_RL_COMMIT
+ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-51f8b2672535e30a588f988ce65871442c5109df}
+
+RUN git clone https://github.com/NVIDIA/NeMo-RL.git /opt/NeMo-RL && cd /opt/NeMo-RL && git checkout ${NEMO_RL_COMMIT}
+
+WORKDIR /opt/NeMo-RL
+
+# Variables to control the build of TE. If there are issues with parallelization, consider
+# setting these to 1.
+ARG MAX_JOBS=4
+ARG NVTE_BUILD_THREADS_PER_JOB=1
+
+ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv
+ENV UV_LINK_MODE=copy
+
+# Create and activate virtual environment
+RUN <<"EOF" bash -exu
+uv venv ${UV_PROJECT_ENVIRONMENT}
+# uv sync has a more reliable resolver than simple uv pip install which can fail
+
+# Sync each training + inference backend one at a time (since they may conflict)
+# to warm the uv cache, then at the end just sync the default dependencies.
+# Do everything in one layer to prevent large layers.
+
+# The venv is symlinked to avoid bloating the layer size
+# (--link-mode symlink on each command takes the place of UV_LINK_MODE=copy set above)
+uv sync --link-mode symlink --locked --no-install-project
+uv sync --link-mode symlink --locked --extra vllm --no-install-project
+uv sync --link-mode symlink --locked --extra mcore --no-install-project
+uv sync --link-mode symlink --locked --all-groups --no-install-project
+EOF
+
+ENV PATH="/opt/nemo_rl_venv/bin:$PATH"
+# NOTE(review): NEMO_RL_VENV_DIR is already set to the same value in the base
+# stage, so this line looks redundant - confirm before removing.
+ENV NEMO_RL_VENV_DIR=/opt/ray_venvs
+
+# Prefetch all virtual environments
+# Copy entire source to temp location, run prefetch, then clean up
+RUN cp -r /opt/NeMo-RL /tmp/nemo-rl-prefetch && cd /tmp/nemo-rl-prefetch && \
+    UV_PROJECT_ENVIRONMENT="/tmp/nemo-rl-prefetch/.venv" uv run nemo_rl/utils/prefetch_venvs.py && \
+    cd / && \
+    rm -rf /tmp/nemo-rl-prefetch
+
+# NOTE(review): NeMo-Skills is cloned without checking out a pinned commit, so
+# the installed version depends on when the image is built.
+RUN git clone https://github.com/NVIDIA/NeMo-Skills.git /opt/NeMo-Skills && cd /opt/NeMo-Skills && uv pip install .
--- a/dockerfiles/Dockerfile.nemo-skills
+++ b/dockerfiles/Dockerfile.nemo-skills
+# Image with NeMo-Skills and the benchmark repositories it relies on.
+FROM python:3.10
+
+RUN apt-get update && apt-get -y install curl git git-lfs
+
+# for ifeval benchmark
+# TODO: can we get just a single dir?
+RUN mkdir /opt/benchmarks
+RUN git clone https://github.com/google-research/google-research.git /opt/benchmarks/google-research --depth=1
+
+# ifbench
+# The unicodedata requirement line is dropped from requirements.txt before installing.
+RUN git clone https://github.com/allenai/IFBench.git /opt/benchmarks/IFBench --depth=1
+RUN cd /opt/benchmarks/IFBench && sed -i '/^unicodedata[=<>]*.*$/d' requirements.txt && pip install -r requirements.txt
+# A standalone `RUN cd /opt/benchmarks` used to follow here. It was removed:
+# a `cd` does not carry over to later instructions, so it only added an empty layer.
+
+RUN pip install langdetect absl-py immutabledict nltk ipython && \
+    python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab')"
+
+# gorilla (berkeley-function-call-leaderboard), pinned to a fixed commit
+RUN git clone https://github.com/ShishirPatil/gorilla.git /opt/gorilla
+RUN cd /opt/gorilla && git checkout d2177992bbba9aa228b53c0645bf8f5613a5a7c6
+RUN cd /opt/gorilla/berkeley-function-call-leaderboard && pip install -e .
+
+# installing apptainer
+# apt-get is used instead of `apt`, whose CLI is not meant for scripts.
+RUN apt-get install -y wget && \
+    cd /tmp && \
+    wget https://github.com/apptainer/apptainer/releases/download/v1.4.1/apptainer_1.4.1_amd64.deb && \
+    apt-get install -y ./apptainer_1.4.1_amd64.deb
+
+RUN mkdir -p /opt/NeMo-Skills/requirements
+COPY pyproject.toml README.md /opt/NeMo-Skills/
+COPY nemo_skills /opt/NeMo-Skills/nemo_skills/
+COPY requirements /opt/NeMo-Skills/requirements/
+# The extras spec is quoted so the shell cannot expand [all] as a glob pattern.
+RUN cd /opt/NeMo-Skills && pip install -e ".[all]"
\ No newline at end of file
--- a/dockerfiles/Dockerfile.sandbox
+++ b/dockerfiles/Dockerfile.sandbox
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Use the base image with Python 3.10 and Flask
+FROM tiangolo/uwsgi-nginx-flask:python3.10
+
+# Install dependencies required for Lean 4 and other tools
+# (also downloads PyPy 3.10 v7.3.17, unpacks it under /opt and links it as `pypy3`)
+RUN apt-get update && \
+    apt-get install -y curl git bzip2 && \
+    curl -L https://downloads.python.org/pypy/pypy3.10-v7.3.17-linux64.tar.bz2 -o /tmp/pypy.tar.bz2 && \
+    tar -xjf /tmp/pypy.tar.bz2 -C /opt/ && \
+    ln -s /opt/pypy3.10-v7.3.17-linux64/bin/pypy3 /usr/local/bin/pypy3 && \
+    rm /tmp/pypy.tar.bz2
+
+# Install elan and make Lean 4 v4.12.0 the default toolchain
+RUN curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y && \
+    /root/.elan/bin/elan toolchain install leanprover/lean4:v4.12.0 && \
+    /root/.elan/bin/elan default leanprover/lean4:v4.12.0 && \
+    /root/.elan/bin/elan self update
+
+# Set environment variables to include Lean and elan/lake in the PATH
+ENV PATH="/root/.elan/bin:$PATH"
+
+# Create Lean project directory and initialize a new Lean project with Mathlib4
+# (toolchain and mathlib are both pinned to v4.12.0)
+RUN mkdir -p /lean4 && cd /lean4 && \
+    /root/.elan/bin/lake new my_project && \
+    cd my_project && \
+    echo 'leanprover/lean4:v4.12.0' > lean-toolchain && \
+    echo 'require mathlib from git "https://github.com/leanprover-community/mathlib4" @ "v4.12.0"' >> lakefile.lean
+
+# Download and cache Mathlib4 to avoid recompiling, then build the project
+RUN cd /lean4/my_project && \
+    /root/.elan/bin/lake exe cache get && \
+    /root/.elan/bin/lake build
+
+# Set environment variables to include Lean project path
+ENV LEAN_PATH="/lean4/my_project"
+ENV PATH="/lean4/my_project:$PATH"
+
+# Set up application code and install Python dependencies
+# (the sandbox server script is copied in as /app/main.py)
+COPY requirements/code_execution.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
+COPY nemo_skills/code_execution/local_sandbox/local_sandbox_server.py /app/main.py
+
+# For scicode eval
+# (downloads the test data from Google Drive to /data/test_data.h5 at build time)
+RUN mkdir /data && pip install gdown && \
+    python -c "import gdown; url = f'https://drive.google.com/uc?id=17G_k65N_6yFFZ2O-jQH00Lh6iaw3z-AW'; gdown.download(url, '/data/test_data.h5', quiet=False)"
+
+# Set the working directory to /app
+WORKDIR /app
+
+# Set Flask app environment variables and ports
+# Both build args have no default, so the ENV values are empty unless
+# --build-arg is passed.
+# NOTE(review): presumably read by the base image's uWSGI setup - confirm.
+ARG UWSGI_CHEAPER
+ENV UWSGI_CHEAPER=$UWSGI_CHEAPER
+
+ARG UWSGI_PROCESSES
+ENV UWSGI_PROCESSES=$UWSGI_PROCESSES
+
+ENV LISTEN_PORT=6000
--- a/dockerfiles/Dockerfile.sglang
+++ b/dockerfiles/Dockerfile.sglang
+# sglang image with a local patch applied on top of the upstream release.
+FROM lmsysorg/sglang:v0.4.10.post2-cu126
+
+# patching for sharding states support for DeepSeek-R1
+# The COPY source is relative to the build context, so the build context must
+# contain dockerfiles/sglang.patch.
+COPY dockerfiles/sglang.patch /sgl-workspace/sglang.patch
+RUN cd /sgl-workspace/sglang && git apply /sgl-workspace/sglang.patch
\ No newline at end of file
--- a/dockerfiles/Dockerfile.verl
+++ b/dockerfiles/Dockerfile.verl
+FROM whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3
+# Set working directory
+WORKDIR /opt
+
+# Install verl from a pinned commit (editable install)
+ENV VERL_COMMIT=2ed63bbf39c22724e4940d97e4b09e4f3e5f6d68
+RUN git clone https://github.com/volcengine/verl.git && \
+    cd verl && \
+    git checkout ${VERL_COMMIT} && \
+    pip3 install -e .
+
+RUN pip install fire
+RUN pip3 install -U pynvml
+
+
+WORKDIR /workspace
+
+# Fix CV2
+RUN pip install opencv-fixer==0.2.5 && \
+    python -c "from opencv_fixer import AutoFix; AutoFix()"
+
+# Install additional dependencies.
+# Requirements with extras are quoted so the shell cannot expand [...] as a glob pattern.
+RUN pip install "math-verify[antlr4_9_3]" "ray[default]" pylatexenc wandb
+
+CMD ["/usr/bin/bash"]
\ No newline at end of file
--- a/dockerfiles/Dockerfile.vllm
+++ b/dockerfiles/Dockerfile.vllm
+FROM vllm/vllm-openai:v0.10.0
+
+# Editable vllm checkout (same tag as the base image) under /opt/vllm, so the
+# Python code can be overridden with a custom mount.
+RUN cd /opt && \
+    git clone https://github.com/vllm-project/vllm.git && \
+    cd vllm && \
+    git checkout v0.10.0 && \
+    VLLM_USE_PRECOMPILED=1 pip install -e .
--- a/dockerfiles/README.md
+++ b/dockerfiles/README.md
+# How to build all necessary dockerfiles
+
+Some Dockerfiles are included directly in this folder; for the remaining images, build instructions are given below.
+To build an image from one of the included Dockerfiles, use a command like this:
+
+```
+docker build -t igitman/nemo-skills-nemo:0.6.1 -f dockerfiles/Dockerfile.nemo .
+```
+It might take a long time for some of the images.
+
+## Building trtllm image
+
+We directly use official TensorRT-LLM ngc containers. Current version is `nvcr.io/nvidia/tensorrt-llm/release:0.21.0`.
--- a/dockerfiles/sglang.patch
+++ b/dockerfiles/sglang.patch
+diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py
+index e2c6a37..4ee6347 100644
+--- a/python/sglang/srt/model_loader/loader.py
+++ b/python/sglang/srt/model_loader/loader.py
+@@ -653,6 +653,11 @@ class ShardedStateLoader(BaseModelLoader):
+                         state_dict.pop(key)
+             if state_dict:
+                 raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
+
+        if hasattr(model, "post_load_weights"):
+            print("Post loading weights")
+            model.post_load_weights()
+
+         return model.eval()
+ 
+     @staticmethod
--- a/docs/assets/chat_interface_demo.gif
+++ b/docs/assets/chat_interface_demo.gif
--- a/docs/basics/chat_interface.md
+++ b/docs/basics/chat_interface.md
+# Chat Interface
+
+The chat interface provides a web UI where you can interactively chat with a deployed model. It supports features like multi-turn conversations and, for certain models like [OpenMath-Nemotron](https://huggingface.co/collections/nvidia/openmathreasoning-68072c0154a5099573d2e730), code execution capabilities.
+
+![Chat Interface Demo](../assets/chat_interface_demo.gif)
+
+## Launching
+
+There are two main ways to launch the chat interface:
+### 1. Via `ns start_server`
+
+You can launch the chat interface alongside the model server directly on a cluster or remote machine using the `ns start_server` command:
+
+```bash
+ns start_server \
+    --model Qwen/Qwen3-8B \
+    --server_type vllm \
+    --server_gpus 1 \
+    --config local \
+    --launch_chat_interface \
+    [--extra_chat_args "<hydra_options_for_chat_ui>"]
+```
+
+### 2. Manual Launch
+
+Alternatively, you can launch the chat interface manually if you have the `nemo_skills` environment installed locally. This method is suitable when you want to connect to an already running model server.
+
+```bash
+python -m nemo_skills.inference.chat_interface.launch server_type=vllm [other_hydra_options]
+```
+Set `server_type` to the type of server you are connecting to (e.g., `server_type=vllm`) and replace `[other_hydra_options]` with any additional Hydra overrides, such as the path to your model's configuration (e.g., `model_config_path=/path/to/model/config.json`).
+
+All relevant parameters for the chat interface, such as the model details, server endpoint, and UI elements, can be configured via Hydra command-line arguments. For a comprehensive list of configurable parameters, please refer to the configuration schema in `nemo_skills/inference/chat_interface/core.py`.
+
+
+When launched via `ns start_server`, the chat interface will run on the same node as the model server.
+
+#### Accessing the Interface (Cluster/Remote Launch)
+
+To access the chat interface when it's launched via `ns start_server` on a remote machine or cluster, you'll need to set up an SSH tunnel to forward the port (default is `7860`) from the remote machine to your local machine.
+
+*   **For Slurm clusters:**
+    Use the following command, replacing `cluster` with the slurm cluster hostname or IP address, `username` with your username, and `node-name` with the name of the node where the server is running:
+    ```bash
+    ssh -J cluster -N -f -L localhost:7860:localhost:7860 username@node-name
+    ```
+
+*   **For remote workstations/servers:**
+    Use the following command, replacing `username` with your username and `server` with the hostname or IP address of the remote machine:
+    ```bash
+    ssh -N -f -L localhost:7860:localhost:7860 username@server
+    ```
+
+Once the tunnel is established, you can access the interface by navigating to `http://localhost:7860` in your web browser.
--- a/docs/basics/cluster-configs.md
+++ b/docs/basics/cluster-configs.md
+# Cluster configs
+
+All of the [pipeline scripts](../pipelines/index.md) accept `--cluster` argument which you can use
+to control where the job gets executed (you need a "local" cluster config to run jobs locally as well).
+That argument picks up one of the configs inside your local
+[cluster_configs](https://github.com/NVIDIA/NeMo-Skills/tree/main/cluster_configs)
+folder by default, but you can specify another location with `--config_dir` or set it in `NEMO_SKILLS_CONFIG_DIR` env variable.
+You can also use `NEMO_SKILLS_CONFIG` env variable instead of the `--cluster` parameter.
+The cluster config defines an executor (local or slurm), mounts for data/model access and (slurm-only) various parameters
+such as account, partition, ssh-tunnel arguments and so on.
+
+The recommended way to launch jobs on slurm is by running all commands locally and specifying `ssh_tunnel` portion in cluster config
+to let [NeMo-Run](https://github.com/NVIDIA/NeMo-Run) know how to connect there.
+But if you prefer to run from the cluster directly, you can install NeMo-Skills there
+and then only specify `job_dir` parameter without using `ssh_tunnel` section in the config.
+
+You can see example configs in [cluster_configs](https://github.com/NVIDIA/NeMo-Skills/tree/main/cluster_configs) folder.
+To create a new config you can either rename and modify one of the examples or run
+
+```bash
+ns setup
+```
+
+that will help to create all necessary configs step-by-step.
+
+## Environment variables
+
+You can define environment variables in the cluster config file, which will be set inside the container.
+
+```yaml
+env_vars:
+  - MYENVVAR  # will pick the value from env
+  - MYENVVAR2=my_value  # will use my_value
+```
+
+If an environment variable is required, and you want us to fail if it's not provided,
+you can use `required_env_vars` instead. One thing to note is that `required_env_vars` does not support
+passing values directly, so you must provide them via environment variable only.
+
+
+Depending on which pipelines you run, you might need to define the following environment variables
+
+``` bash
+# only needed for training (can opt-out with --disable_wandb)
+export WANDB_API_KEY=...
+# only needed if using gated models, like llama3.1
+export HF_TOKEN=...
+# only needed if running inference with OpenAI models
+export OPENAI_API_KEY=...
+# only needed if running inference with Azure OpenAI models
+export AZURE_OPENAI_API_KEY=...
+# only needed if running inference with Nvidia NIM models
+export NVIDIA_API_KEY=...
+```
+
+
+## Useful tips
+
+Here are some suggestions on what can be defined in cluster configs for different use-cases
+
+1. Set `HUGGINGFACE_HUB_CACHE` environment variable to ensure all HuggingFace downloads are cached
+
+2. If you want to have a custom version of one of the underlying libraries that we use
+   (e.g. [NeMo](https://github.com/NVIDIA/NeMo) or [verl](https://github.com/volcengine/verl)),
+   you can clone it locally (or on cluster if using slurm), make your changes and then override in the container with
+
+      ```yaml
+      mounts:
+         - <your path>/NeMo:/opt/NeMo
+         - <your path>/verl:/opt/verl
+      ```
+
+3. You can specify custom containers - our code should work out-of-the-box or with very few changes with different
+   versions of inference libraries (e.g. [vLLM](https://github.com/vllm-project/vllm)) or training libraries
+   (e.g. [NeMo](https://github.com/NVIDIA/NeMo)). If you get some errors, you might also need to modify the entry-point
+   scripts we use, e.g.
+   [nemo_skills/inference/server/serve_vllm.py](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/inference/server/serve_vllm.py)
+   or [nemo_skills/training/start_sft.py](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/training/start_sft.py)
+
+4. For slurm clusters it's recommended to [build .sqsh files](https://github.com/NVIDIA/enroot/blob/master/doc/cmd/import.md#example)
+   for all containers and reference their paths on the cluster in the cluster config.
--- a/docs/basics/code-packaging.md
+++ b/docs/basics/code-packaging.md
+# Code packaging
+
+We use [NeMo-Run](https://github.com/NVIDIA/NeMo-Run) for managing our experiments with local and slurm-based
+execution supported (please open an issue if you need to run our code on other kinds of clusters).
+This means that even if you need to submit jobs on slurm, you can do it from your local machine by defining an
+appropriate cluster config and nemo-run will package and upload your code, data and manage
+all complexities of slurm scheduling. Check their documentation to learn how to fetch logs, check status,
+cancel jobs, etc.
+
+To decide which code to package we use the following logic:
+
+1. If you run commands from inside a cloned NeMo-Skills repository, we will package that repository.
+2. If you run commands from inside a git repository which is not NeMo-Skills (doesn't have `nemo_skills` top-level folder),
+   we will package your current repository and also include `nemo_skills` subfolder from its installed location.
+3. If you run commands from outside of any git repository, we will only package `nemo_skills` subfolder from its installed
+   location.
+
+Put simply, we will always include `nemo_skills` and will additionally include your personal git repository if you're
+running commands from it.
+
+!!! note
+
+    When packaging a git repository, NeMo-Run will only package the code tracked by git
+    (as well as all jsonl files from `nemo_skills/dataset`).
+    Any non-tracked files will not be automatically available inside the container or uploaded to slurm.
+
+    When packaging `nemo_skills` from its installed location (which might not be a git repository), we will
+    upload **all** the files inside `nemo_skills` subfolder. Make sure you do not store any large files there
+    to avoid uploading them on the cluster with each experiment!
+
+!!! note
+    
+    When you run commands from a git repo with uncommitted changes, NeMo-Run throws the following error
+    ```
+    RuntimeError: Your repo has uncommitted changes. Please commit your changes or set check_uncommitted_changes to False to proceed with packaging.
+    ```
+    This error can be avoided by either taking care of the uncommitted changes (via commit/revert), or setting the environment variable: 
+    ```bash
+    export NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1
+    ```
+    In all cases, uncommitted code will not be used. 
+
+
+Finally, it's important to keep in mind that whenever you submit a new experiment, NeMo-Run will create a copy of your
+code package both locally (inside `~/.nemo_run`) and on the cluster (inside the `ssh_tunnel.job_dir` path from your cluster config).
+If you submit multiple experiments from the same Python script, they will all share code, so only one copy will be
+created per run of that script. Even so, at some point, the code copies will accumulate and you will run out of
+space both locally and on the cluster. There is currently no automatic cleaning, so you have to monitor for that and
+periodically remove local and cluster nemo-run folders to free up space. There is no side effect of doing that (they will
+be automatically recreated) as long as you don't have any running jobs when you remove the folders.
+If you want to have more fine-grained control over code reuse, you can directly specify the `--reuse_code_exp` argument when submitting jobs.
+
+While our job submission is somewhat complicated and goes through NeMo-Run, at the end, we simply execute a particular sbatch file
+that is uploaded to the cluster. It is helpful sometimes to see what's in it and modify directly. You can find sbatch file(s)
+for each job inside `ssh_tunnel.job_dir` cluster folder that is defined in your cluster config.
--- a/docs/basics/index.md
+++ b/docs/basics/index.md
--- a/docs/basics/inference.md
+++ b/docs/basics/inference.md
+# Inference
+
+Here are the instructions on how to run inference with our repo.
+
+## Download/convert the model
+
+Get the model you want to use. You can use any model that's supported by vLLM, sglang, TensorRT-LLM or Megatron.
+You can also use [Nvidia NIM API](https://www.nvidia.com/en-us/ai/) for models that are hosted there.
+
+## Start the server
+
+Start the server hosting your model. Here is an example (make sure the `/hf_models` mount is defined in your cluster config). Skip this step if you want to use cloud models through an API.
+
+```bash
+ns start_server \
+    --cluster local \
+    --model /hf_models/Meta-Llama-3.1-8B-Instruct \
+    --server_type vllm \
+    --server_gpus 1 \
+    --server_nodes 1
+```
+
+If the model needs to execute code, add `--with_sandbox`
+
+You could also launch an interactive web chat application by adding `--launch_chat_interface`, for more details see the [Chat Interface documentation](chat_interface.md).
+
+## Send inference requests
+
+Click on :material-plus-circle: symbols in the snippet below to learn more details.
+
+
+=== "Self-hosted models"
+
+    ```python
+    from nemo_skills.inference.model import get_model
+    from nemo_skills.prompt.utils import get_prompt
+
+    llm = get_model(server_type="vllm")  # localhost by default
+    prompt = get_prompt('generic/default', 'llama3-instruct') # (1)!
+    prompt = prompt.fill({'question': "What's 2 + 2?"})
+    print(prompt) # (2)!
+    output = llm.generate_sync(prompt=prompt)
+    print(output["generation"]) # (3)!
+    ```
+
+    1.   Here we use [generic/default](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/generic/default.yaml) config
+         and [llama3-instruct](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/template/llama3-instruct.yaml) template.
+
+         See [nemo_skills/prompt](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt) for more config/template options
+         or [create your own prompts](prompt-format.md)
+
+
+    2.   This should print
+
+         ```python-console
+         >>> print(prompt)
+         <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+         <|eot_id|><|start_header_id|>user<|end_header_id|>
+
+         What's 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+         ```
+
+         If you don't want to use our prompt class, just create this string yourself
+
+    3.   This should print
+         ```python-console
+         >>> print(output["generation"])
+         2 + 2 = 4.
+         ```
+
+=== "API models"
+
+    ```python
+    from nemo_skills.inference.model import get_model
+    from nemo_skills.prompt.utils import get_prompt
+
+    llm = get_model( # (1)!
+        server_type="openai",  # NIM models are using OpenAI API
+        base_url="https://integrate.api.nvidia.com/v1",
+        model="meta/llama-3.1-8b-instruct",
+    )
+    prompt = get_prompt('generic/default') # (2)!
+
+    prompt = prompt.fill({'question': "What's 2 + 2?"})
+
+    print(prompt) # (3)!
+    output = llm.generate_sync(prompt=prompt)
+    print(output["generation"]) # (4)!
+    ```
+
+    1.   Don't forget to define `NVIDIA_API_KEY`.
+
+         To use OpenAI models, use `OPENAI_API_KEY` and set `base_url=https://api.openai.com/v1`.
+
+    2.   Here we use [generic/default](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/generic/default.yaml) config.
+         Note that with API models we can't add special tokens, so prompt template is not specified.
+
+         See [nemo_skills/prompt](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt) for more config/template options
+         or [create your own prompts](prompt-format.md)
+
+
+    3.   This should print
+
+         ```python-console
+         >>> print(prompt)
+         [{'role': 'user', 'content': "What's 2 + 2?"}]
+         ```
+
+         If you don't want to use our prompt class, just create this list yourself
+
+    4.   This should print
+         ```python-console
+         >>> print(output["generation"])
+         2 + 2 = 4.
+         ```
+
+=== "With code execution"
+
+    ``` python
+    from nemo_skills.code_execution.sandbox import get_sandbox
+    from nemo_skills.inference.model import get_code_execution_model
+    from nemo_skills.prompt.utils import get_prompt
+
+    sandbox = get_sandbox()  # localhost by default
+    llm = get_code_execution_model(server_type="vllm", sandbox=sandbox)
+    prompt = get_prompt('generic/default', 'llama3-instruct', code_tags='llama3') # (1)!
+    prompt.config.system = ( # (2)!
+        "Environment: ipython\n\n"
+        "Use Python to solve this math problem."
+    )
+    prompt = prompt.fill({'question': "What's 2 + 2?"})
+    print(prompt) # (3)!
+    output = llm.generate_sync(prompt=prompt, **prompt.get_code_execution_args()) # (4)!
+    print(output["generation"]) # (5)!
+    ```
+
+    1.   Here we use [generic/default](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/generic/default.yaml) config
+         and [llama3-instruct](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/template/llama3-instruct.yaml) template.
+
+         Note how we are updating system message on the next line (you can also include it in the config directly).
+
+         See [nemo_skills/prompt](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt) for more config/template options
+         or [create your own prompts](prompt-format.md)
+
+    2.   8B model doesn't always follow these instructions, so using 70B or 405B for code execution is recommended.
+
+    3.   This should print
+
+         ```python-console
+         >>> print(prompt)
+         <|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+         Environment: ipython
+
+         Use Python to solve this math problem.<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+         What's 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+         ```
+
+         If you don't want to use our prompt class, just create this string yourself
+
+    4.   `prompt.get_code_execution_args()` simply returns a dictionary with start/stop tokens,
+         so that we know when to stop LLM generation and how to format the output.
+
+         If you don't want to use our prompt class, just define those parameters directly.
+
+    5.   This should print
+         ```python-console
+         >>> print(output["generation"])
+         <|python_tag|>print(2 + 2)<|eom_id|><|start_header_id|>ipython<|end_header_id|>
+
+         completed
+         [stdout]
+         4
+         [/stdout]<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+         The answer is 4.
+         ```
+
+         The "4" in the stdout comes directly from the Python interpreter running in the sandbox.
+
+Note that for self-hosted models we are explicitly adding all the special tokens before sending prompt to an LLM.
+This is necessary to retain flexibility. E.g. this way we can use base model format with
+instruct models that we found to work better with few-shot examples.
+
+You can learn more about how our prompt formatting works in the [prompt format docs](../basics/prompt-format.md).
+
+!!! note
+
+    You can also use slurm config when launching a server. If you do that, add `host=<slurm node hostname>`
+    to the `get_model/sandbox` calls and define `NEMO_SKILLS_SSH_KEY_PATH` and `NEMO_SKILLS_SSH_SERVER` env vars
+    to set the connection through ssh.
\ No newline at end of file
--- a/docs/basics/prompt-format.md
+++ b/docs/basics/prompt-format.md
+# Prompt utilities
+
+Our prompts are configured via three yaml files:
+
+1. **Prompt template** - defines model-specific chat format and special tokens
+2. **Prompt config** - contains the actual prompt text with placeholders  
+3. **Code tags** - specifies code formatting tokens, required for code execution
+
+
+## Prompt template
+
+The template file defines model-specific special tokens, e.g. bos, turn tokens,
+user/assistant/system message, etc. All of the
+templates that we support by default are available in
+[nemo_skills/prompt/template](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/template)
+folder. Here is an example template for
+[llama3-instruct](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/template/llama3-instruct.yaml) models:
+
+```yaml
+# Prompt specification for the original Llama3-instruct model
+
+# these tokens are always used to construct a prompt like this
+#
+#   single-turn:
+#     <text_begin><system_begin>{system}<system_end><user_begin>{user}<user_end><assistant_begin>{generation}
+#   multi-turn:
+#     <text_begin><system_begin>{system}<system_end><user_begin>{user1}<user_end><assistant_begin>{assistant1}<assistant_end>...
+#     <user_begin>{userN}<user_end><assistant_begin>{generation}
+
+text_begin: "<|begin_of_text|>"
+
+system_begin: "<|start_header_id|>system<|end_header_id|>\n\n"
+system_end: "<|eot_id|>"
+
+user_begin: "<|start_header_id|>user<|end_header_id|>\n\n"
+user_end: "<|eot_id|>"
+
+assistant_begin: "<|start_header_id|>assistant<|end_header_id|>\n\n"
+assistant_end: "<|eot_id|>"
+
+stop_phrases: ["<|eot_id|>"]
+```
+
+You can specify a particular template with `++prompt_template=...`. If you don't add a .yaml extension (e.g.
+`++prompt_template=llama3-instruct`), we assume you want to use one of the existing templates and will search
+in the included folder. If you provide a full path, we will take the file you specify instead.
+
+!!! note
+
+    If you're using OpenAI server type (models are hosted elsewhere), you cannot provide the template
+    as we cannot add any special tokens and have to send the user/assistant messages following the OpenAI API.
+    For self-hosted models with TensorRT-LLM, the template is required, but for other servers it's optional.
+
+## Prompt config
+
+The prompt config contains user and system messages with placeholders for keys from a data file.
+The configs are model independent (any model can be used with any config).
+All of the configs that we support by default are available in
+[nemo_skills/prompt/config](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config)
+folder. Here is an example prompt for
+[math evaluations](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/generic/math.yaml):
+
+```yaml
+# default prompt for all math benchmarks (e.g. gsm8k, math)
+
+few_shot_examples:
+  prefix: "Here are some examples of problems and solutions you can refer to.\n\n"
+  template: "Problem:\n{problem}\n\nSolution:\n{solution}\n\n\n\n\n\n"
+  suffix: "Here is the problem you need to solve:\n"
+  # this is built as <prefix>{template.format(example1)}{template.format(example2)}...{template.format(exampleN)}<suffix>
+  # and available as {examples} key in the final prompt
+  # if examples_type is not specified, then {examples} will be empty
+  # by default there are no examples, but can be changed from code/cmd
+
+system: ""
+
+user: |-
+  Solve the following math problem. Make sure to put the answer (and only answer) inside \boxed{{}}.
+
+  {examples}{problem}
+```
+
+Note that we use `{problem}`, `{solution}` and `{examples}` format strings here. The `{examples}` is a special
+key that will be used to include few shot examples you specify above (it's empty unless you add `++examples_type` or
+specify it in the config like e.g. in
+[llama3-gsm8k prompt](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/generic/gsm8k.yaml)).
+All other keys will need to be specified when you call `prompt.fill`
+(more on that in the [prompt-api section](#prompt-api)) so that we can replace placeholders with actual input.
+
+The input for few shot examples always comes from one of the available example types listed
+[here](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/few_shot_examples/__init__.py). E.g. in the
+[llama3-instruct/gsm8k](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/llama3-instruct/gsm8k.yaml)
+prompt the `gsm8k_standard_few_shot` examples from
+[here](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/few_shot_examples/gsm8k.py) are used.
+
+
+## Code tags
+
+Code tags define the special tokens that models use to mark executable code blocks and their output. Code tags are required when using code execution.
+All code tags that we support by default are available in
+[nemo_skills/prompt/code_tags](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/code_tags).
+
+Here is an example code tags file for the [llama3](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/code_tags/llama3.yaml) family:
+
+```yaml
+# Code tags for llama3 family models
+
+# used to execute code within these tags
+code_begin: "<|python_tag|>"
+code_end: "<|eom_id|>"
+
+# used to extract the code output
+code_output_begin: "<|start_header_id|>ipython<|end_header_id|>"
+code_output_end: "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
+
+# how to post-process the captured output (choices: llama, qwen)
+code_output_format: "llama"
+```
+
+## Prompt API
+
+If you're running one of the pipeline scripts, you can control the prompt by using:
+
+```bash
+++prompt_template=...
+++prompt_config=...
+++code_tags=...
+++examples_type=...
+```
+
+If you're implementing a new script, you can use the following code to create a prompt and then use it:
+
+```python
+from nemo_skills.prompt.utils import get_prompt
+
+# The third parameter is optional and only needed for code execution
+prompt = get_prompt('generic/math', 'llama3-instruct', code_tags='llama3')
+print(prompt.fill({'problem': "What's 2 + 2?"}))
+```
+
+which outputs
+
+```python-console
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+Solve the following math problem. Make sure to put the answer (and only answer) inside \boxed{}.
+
+What's 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+```
+
+Or if you want to skip the template and use OpenAI API
+
+```python
+from nemo_skills.prompt.utils import get_prompt
+
+prompt = get_prompt('generic/math')
+print(prompt.fill({'problem': "What's 2 + 2?"}))
+```
+
+which outputs
+
+```python-console
+[
+  {
+    'role': 'user',
+    'content': "Solve the following math problem. Make sure to put the answer (and only answer) inside \\boxed{}.\n\nWhat's 2 + 2?"
+  }
+]
+```
+
+You can also have a look at the [tests](https://github.com/NVIDIA/NeMo-Skills/tree/main/tests/test_prompts.py) to see more examples of using our prompt API.
+
+
+## Multi-turn prompts
+
+If your data is naturally multi-turn (e.g. user-assistant conversations), you can use a special parameter `multi_turn_key` to format
+the whole conversation together. It can be of any length, as long as each entry except the last has a special `assistant` key. The prompt config
+will be applied to each list entry separately. Here is an example
+
+```python
+from nemo_skills.prompt.utils import get_prompt
+
+prompt = get_prompt('generic/default')
+data = {'turns': [{'question': "What's 2 + 2?", 'assistant': "easy, that's 5!"}, {'question': 'Can you double check?'}]}
+print(prompt.fill(data, multi_turn_key='turns'))
+```
+
+which outputs
+
+```python-console
+[
+  {
+    'role': 'user',
+    'content': "What's 2 + 2?"
+  },
+  {
+    'role': 'assistant',
+    'content': "easy, that's 5!"
+  },
+  {
+    'role': 'user',
+    'content': 'Can you double check?'
+  }
+]
+```
+
+or if using template
+
+```python
+from nemo_skills.prompt.utils import get_prompt
+
+prompt = get_prompt('generic/default', 'llama3-instruct')
+data = {'turns': [{'question': "What's 2 + 2?", 'assistant': "easy, that's 5!"}, {'question': 'Can you double check?'}]}
+print(prompt.fill(data, multi_turn_key='turns'))
+```
+
+which outputs
+
+```python-console
+<|begin_of_text|><|start_header_id|>system<|end_header_id|>
+
+<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+What's 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+
+easy, that's 5!<|eot_id|><|start_header_id|>user<|end_header_id|>
+
+Can you double check?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
+```
--- a/docs/basics/sandbox.md
+++ b/docs/basics/sandbox.md
+# Sandbox for code execution
+
+Our pipeline relies on a Python interpreter to execute code generated by LLMs. This creates a security risk,
+since we are executing arbitrary code that we do not have full control over. To partially address this,
+we provide a basic sandbox that we use to execute code and validate the correctness of LLM-generated answers.
+
+## Local sandbox
+
+The default sandbox option used in our pipeline is a local docker container.
+Check out [nemo_skills/code_execution/local_sandbox](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/code_execution/local_sandbox)
+for implementation details.
+
+Please note that our provided sandbox is not fully secure and you are strongly encouraged to
+set up a properly configured virtual machine such that generated code executes in an unprivileged environment
+with no external network access unless necessary.
+
+## Piston sandbox
+
+A better alternative is to host a [Piston server](https://github.com/engineer-man/piston)
+in a properly configured VM. If you're using a Piston server (you need to host it yourself),
+add the following parameters to the relevant scripts
+
+```bash
+++sandbox_type=piston
+++sandbox.host=<where your server is hosted, e.g. https://emkc.org/api/v2/piston>
+```
+
+## Other sandboxes
+
+Our sandbox API makes no assumptions on where or how the code is executed, so it's very easy
+to extend it. E.g. you can use AWS Lambda functions or other similar offerings.
+Please open an issue if you'd like us to add support for another sandbox in the future.
\ No newline at end of file
--- a/docs/css/extra.css
+++ b/docs/css/extra.css
+/* Target only inline code */
+p code, li code, td code {
+  word-break: keep-all;
+  white-space: nowrap;
+}
+
+/* Preserve formatting for multi-line code blocks */
+pre code {
+  word-break: normal;
+  white-space: pre;
+}
+
+:root {
+  --md-tooltip-width: 600px;
+}
\ No newline at end of file
--- a/docs/favicon.ico
+++ b/docs/favicon.ico
--- a/docs/index.md
+++ b/docs/index.md
+---
+hide:
+  - navigation
+  - toc
+---
+
+[NeMo-Skills](https://github.com/NVIDIA/NeMo-Skills) is a collection of pipelines to improve "skills" of large language models (LLMs). We support everything needed for LLM development, from synthetic data generation, to model training, to evaluation on a wide range of benchmarks. Start developing on a local workstation and move to a large-scale Slurm cluster with just a one-line change.
+
+Here are some of the features we support:
+
+- [Flexible LLM inference](basics/inference.md):
+    - Seamlessly switch between API providers, local server and large-scale Slurm jobs for LLM inference.
+    - Host models (on 1 or many nodes) with [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [vLLM](https://github.com/vllm-project/vllm), [sglang](https://github.com/sgl-project/sglang) or [Megatron](https://github.com/NVIDIA/Megatron-LM).
+    - Scale SDG jobs from 1 GPU on a local machine all the way to tens of thousands of GPUs on a Slurm cluster.
+- [Model evaluation](pipelines/evaluation.md):
+    - Evaluate your models on many popular benchmarks.
+        - Math problem solving: hmmt_feb25, brumo25, aime24, aime25, omni-math (and many more)
+        - Formal proofs in Lean: minif2f, proofnet
+        - Coding skills: scicode, livecodebench, human-eval, mbpp
+        - Chat/instruction following: ifbench, ifeval, arena-hard
+        - General knowledge: mmlu, mmlu-pro, gpqa
+        - Long context: ruler
+    - Easily parallelize each evaluation across many Slurm jobs, self-host LLM judges, bring your own prompts or change benchmark configuration in any other way.
+- [Model training](pipelines/training.md): Train models using [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/), [NeMo-RL](https://github.com/NVIDIA/NeMo-RL/) or [verl](https://github.com/volcengine/verl).
+
+
+To get started, follow these [steps](basics/index.md), browse available [pipelines](./pipelines/index.md) or run `ns --help` to see all available
+commands and their options.
+
+You can find more examples of how to use NeMo-Skills in the [tutorials](./tutorials/index.md) page.
+
+We've built and released many popular models and datasets using NeMo-Skills. See all of them in the [Papers & Releases](./releases/index.md) documentation.
\ No newline at end of file
--- a/docs/pipelines/checkpoint-conversion.md
+++ b/docs/pipelines/checkpoint-conversion.md
+# Checkpoint conversion
+
+!!! info
+
+    This pipeline starting script is [nemo_skills/pipeline/convert.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/convert.py)
+
+    All extra parameters are passed to one of the following scripts
+
+    * For conversion to NeMo:
+        - If `--model_type=llama`: [nemo_skills/conversion/hf_to_nemo_llama.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/conversion/hf_to_nemo_llama.py)
+        - If `--model_type=qwen`: [nemo_skills/conversion/hf_to_nemo_qwen.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/conversion/hf_to_nemo_qwen.py)
+
+    * For conversion to HuggingFace:
+        - If `--model_type=llama`: [nemo_skills/conversion/nemo_to_hf_llama.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/conversion/nemo_to_hf_llama.py)
+        - If `--model_type=qwen`: [nemo_skills/conversion/nemo_to_hf_qwen.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/conversion/nemo_to_hf_qwen.py)
+
+
+You only need to convert models if you want to use NeMo-Aligner for training. All other server and training backends
+should work with HuggingFace format.
+
+To convert the checkpoint from one format to another use a command like this
+
+```bash
+ns convert \
+    --cluster=slurm \
+    --input_model=/hf_models/Meta-Llama-3.1-70B-Instruct \
+    --output_model=/trt_models/llama3.1-70b-instruct \
+    --convert_from=hf \
+    --convert_to=nemo \
+    --model_type=llama \
+    --num_gpus=8 \
+    --hf_model_name=meta-llama/Meta-Llama-3.1-70B-Instruct
+```
+
+You can provide any extra arguments that will be passed directly to the underlying conversion scripts.
+
--- a/docs/pipelines/decontamination.md
+++ b/docs/pipelines/decontamination.md
+# LLM-based data decontamination
+
+!!! info
+
+    This pipeline starting script is [nemo_skills/pipeline/generate.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/generate.py)
+
+    All extra parameters are passed to [nemo_skills/inference/check_contamination.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/inference/check_contamination.py)
+
+We implemented an LLM-based data decontamination pipeline following
+[lmsys methodology](https://lmsys.org/blog/2023-11-14-llm-decontaminator/).
+
+There are two main ways to use this pipeline: to check an existing dataset
+for contamination and to decontaminate the training dataset by removing all
+contaminated questions.
+
+## To check for contamination
+
+Let's say you want to check for contamination of [MATH](https://github.com/hendrycks/math)
+training set with MATH, AMC-23 and AIME-24 test sets. First, get the data
+
+```bash
+ns prepare_data math amc23 aime24
+```
+
+Then we need to retrieve top-k similar questions from the training set. Assuming
+you have `/workspace` defined in your [cluster config](../basics/cluster-configs.md)
+you can do it in the following way
+
+```python
+from nemo_skills.pipeline.cli import wrap_arguments, run_cmd, generate
+
+
+test_sets = ['math', 'amc23', 'aime24']
+compare_to = ",".join(f"/nemo_run/code/nemo_skills/dataset/{test_set}/test.jsonl" for test_set in test_sets)
+
+cmd = (
+    f"python -m nemo_skills.inference.retrieve_similar "
+    f"    ++retrieve_from='/nemo_run/code/nemo_skills/dataset/math/train.jsonl' "
+    f"    ++compare_to=\\\'{compare_to}\\\'"
+    f"    ++output_file='/workspace/math-contamination-retrieved.jsonl' "
+    f"    ++top_k=1 "
+)
+
+run_cmd(
+    cluster="local",
+    container="nemo",
+    num_gpus=1,  # can increase this if you have more gpus
+    ctx=wrap_arguments(cmd),
+)
+```
+
+Next, you need to run LLM inference to check those closest found questions from the output file. Here is an example
+using Llama-405B from Nvidia API catalog, but you can replace it with OpenAI models or self-hosted models.
+
+```python
+generate(
+    cluster="local",
+    generation_type="check_contamination",
+    input_file="/workspace/math-contamination-retrieved.jsonl",
+    output_dir="/workspace/math-contamination-results",
+    model="meta/llama-3.1-405b-instruct",
+    server_type="openai",
+    server_address="https://integrate.api.nvidia.com/v1",
+)
+```
+
+This script will print an output that looks like this
+
+```
+Contamination portion: 13.91% (705/5070)
+```
+
+## To decontaminate training data
+
+If you instead want to remove contaminated examples from your training data, all the commands stay the same, but
+you need to swap values for the `retrieve_from` and `compare_to` arguments in the `retrieve_similar` step
+since we now want to check each training set example and find the closest test set problems.
+
+After you get `/workspace/math-contamination-results/output.jsonl`,
+you can pass it into [prepare_data command](training.md#preparing-the-data)
+with `++contamination_file=...` option.
+
+See a more detailed example in [OpenMathInstruct-2 dataset construction pipeline](../releases/openmathinstruct2/dataset.md#decontamination).
\ No newline at end of file
--- a/docs/pipelines/evaluation.md
+++ b/docs/pipelines/evaluation.md
+# Model evaluation
+
+!!! info
+
+    This pipeline starting script is [nemo_skills/pipeline/eval.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/eval.py)
+
+    All extra parameters are passed to [nemo_skills/inference/generate.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/inference/generate.py)
+
+
+We support many popular benchmarks and it's easy to add new ones in the future. E.g. we support
+
+- Math problem solving: hmmt_feb25, brumo25, aime24, aime25, omni-math (and many more)
+- Formal proofs in Lean: minif2f, proofnet
+- Coding skills: livecodebench, human-eval, mbpp
+- Chat/instruction following: ifeval, arena-hard
+- General knowledge: mmlu, mmlu-pro, gpqa
+- Long context: ruler
+
+See [nemo_skills/dataset](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset) where each folder is a benchmark we support.
+
+Here is how to run evaluation (using API model as an example,
+but same command works with self-hosted models both locally and on slurm).
+Make sure that `/workspace` is mounted inside of your
+[cluster config](../basics/cluster-configs.md).
+
+## Preparing data
+
+You need to run the following commands to prepare the data.
+
+```bash
+ns prepare_data
+```
+
+If you're only interested in a subset of datasets (e.g. only math-related or code-related), run with
+`--dataset_groups ...` and if you only need a couple of specific datasets, list them directly e.g.
+
+```bash
+ns prepare_data gsm8k human-eval mmlu ifeval
+```
+
+If you have the repo cloned locally, the data files will be available inside `nemo_skills/dataset/<benchmark>/<split>.jsonl`
+and if you installed from pip, they will be downloaded to wherever the repo is installed, which you can figure out by running
+
+```bash
+python -c "import nemo_skills; print(nemo_skills.__path__)"
+```
+
+Some benchmarks (e.g. ruler) require extra parameters to be passed to the prepare_data script. Thus you'd need to explicitly
+call `ns prepare_data` for all of them, e.g. for ruler you can use
+
+```bash
+ns prepare_data ruler --setup=llama_128k --tokenizer_path=meta-llama/Llama-3.1-8B-Instruct --max_seq_length=131072
+```
+
+## Greedy decoding
+
+```bash
+ns eval \
+    --cluster=local \
+    --server_type=openai \
+    --model=meta/llama-3.1-8b-instruct \
+    --server_address=https://integrate.api.nvidia.com/v1 \
+    --benchmarks=gsm8k,human-eval \
+    --output_dir=/workspace/test-eval
+```
+
+This will run evaluation on gsm8k and human-eval for the Llama 3.1 8B model. If you're running
+on slurm, by default each benchmark is run in a separate job, but you can control this with
+the `--num_jobs` parameter.
+
+After the evaluation is done, you can get metrics by calling
+
+```bash
+ns summarize_results --cluster local /workspace/test-eval
+```
+
+Which should print the following
+
+```
+---------------------------------------- gsm8k ----------------------------------------
+evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer
+pass@1          | 1319        | 180        | 164         | 81.96%           | 4.93%
+
+
+------------------------------------------- human-eval -------------------------------------------
+evaluation_mode | num_entries | avg_tokens | gen_seconds | passing_base_tests | passing_plus_tests
+pass@1          | 164         | 199        | 29          | 64.63%             | 60.37%
+```
+
+The [summarize_results](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/summarize_results.py) script
+will fetch the results from cluster automatically if you ran the job there.
+
+!!! note
+
+    The numbers above don't match reported numbers for Llama 3.1 because we are not using
+    the same prompts by default. You would need to modify the prompt config for each specific benchmark
+    to match the results exactly. E.g. to match gsm8k numbers add `++prompt_config=llama3/gsm8k`
+    (but we didn't include all the prompts used for Llama3 evaluation, only a small subset as an example).
+
+## Using multiple samples
+
+You can add `:<num repeats>` after the benchmark name to repeat evaluation multiple times with high temperature
+that can be used for majority voting or estimating pass@k. E.g. if we run with
+
+```bash
+ns eval \
+    --cluster=local \
+    --server_type=openai \
+    --model=meta/llama-3.1-8b-instruct \
+    --server_address=https://integrate.api.nvidia.com/v1 \
+    --benchmarks gsm8k:4,human-eval:4 \
+    --output_dir=/workspace/test-eval
+```
+
+you will see the following output after summarizing results
+
+```
+---------------------------------------- gsm8k -----------------------------------------
+evaluation_mode  | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer
+pass@1[avg-of-4] | 1319        | 180        | 680         | 80.44%           | 6.31%
+majority@4       | 1319        | 180        | 680         | 88.40%           | 0.15%
+pass@4           | 1319        | 180        | 680         | 93.63%           | 0.15%
+
+
+-------------------------------------------- human-eval -------------------------------------------
+evaluation_mode  | num_entries | avg_tokens | gen_seconds | passing_base_tests | passing_plus_tests
+pass@1[avg-of-4] | 164         | 215        | 219         | 64.63%             | 59.30%
+pass@4           | 164         | 215        | 219         | 79.27%             | 74.39%
+```
+
+
+## Using data on cluster
+
+Some benchmarks (e.g. ruler) have very large input datasets and it's inefficient to prepare them on a local machine and
+keep uploading them to the cluster with every evaluation job. Instead, you can prepare them on the cluster directly. To do that,
+run the prepare_data command with `--data_dir` and `--cluster` options, e.g.
+
+```bash
+ns prepare_data \
+    --data_dir=/workspace/ns-data \
+    --cluster=slurm \
+    ruler --setup llama_128k --tokenizer_path meta-llama/Llama-3.1-8B-Instruct --max_seq_length 130900
+```
+
+Then during evaluation, you'd need to provide the same `data_dir` argument and it will read the data from cluster
+directly. You can also use `NEMO_SKILLS_DATA_DIR` environment variable instead of an explicit argument.
+
+Here is an example evaluation command for ruler that uses data_dir parameter
+
+```python
+from nemo_skills.pipeline.cli import eval, wrap_arguments
+
+eval(
+    # using a low number of concurrent requests since it's almost entirely prefill stage
+    ctx=wrap_arguments("++max_concurrent_requests=32"),
+    cluster="slurm",
+    model="/hf_models/Meta-Llama-3.1-8B-Instruct",
+    server_type="sglang",
+    output_dir="/workspace/eval-ruler",
+    data_dir="/workspace/ns-data",
+    benchmarks="ruler.llama_128k",
+    server_gpus=8,
+    expname="eval-ruler",
+)
+```
+
+## How the benchmarks are defined
+
+Each benchmark exists as a separate folder inside
+[nemo_skills/dataset](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset). Inside
+those folders there needs to be `prepare.py` script which can be run to download and format benchmark
+data into a .jsonl input file (or files if it supports train/validation besides a test split) that
+our scripts can understand. There also needs to be an `__init__.py` that defines some default variables
+for that benchmark, such as prompt config, evaluation type, metrics class and a few more.
+
+This information is then used inside the eval pipeline to initialize the default setup (but all arguments can
+be changed from the command line).
+
+Let's look at gsm8k to understand a bit more how each part of the evaluation works.
+
+Inside [nemo_skills/dataset/gsm8k/\_\_init\_\_.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/gsm8k/__init__.py) we see the following
+
+```python
+# settings that define how evaluation should be done by default (all can be changed from cmdline)
+PROMPT_CONFIG = 'generic/math'
+DATASET_GROUP = 'math'
+METRICS_TYPE = "math"
+EVAL_ARGS = "++eval_type=math"
+GENERATION_ARGS = ""
+```
+
+The prompt config and default generation arguments are passed to the
+[nemo_skills/inference/generate.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/inference/generate.py) and
+the default eval args are passed to the
+[nemo_skills/evaluation/evaluate_results.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/evaluation/evaluate_results.py).
+The dataset group is used by [nemo_skills/dataset/prepare.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/prepare.py)
+to help download only benchmarks from a particular group if `--dataset_groups` parameter is used.
+Finally, the metrics class is used by [nemo_skills/evaluation/metrics.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/evaluation/metrics.py)
+which is called when you run summarize results pipeline.
+
+To create a new benchmark in most cases you only need to add a new prepare script and the corresponding
+default prompt. If the new benchmark needs some unsupported post-processing or metric summarization
+you'd need to also add a new evaluation type and a new metrics class.
\ No newline at end of file
--- a/docs/pipelines/generation.md
+++ b/docs/pipelines/generation.md
--- a/docs/pipelines/index.md
+++ b/docs/pipelines/index.md
+# Pipelines
+
+## Basics
+
+NeMo-Skills has a large collection of building blocks that you can use to construct various pipelines to improve LLMs.
+All of the "pipeline" scripts are located in the [nemo_skills/pipeline](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/pipeline)
+folder and have a unified interface that helps us connect them together.
+
+Each pipeline script is a wrapper that accepts *wrapper* arguments that tell us how to orchestrate the job. These
+arguments are directly listed in the corresponding Python function or visible when you run `ns <wrapper script> --help`.
+Any other arguments that you pass to the wrapper script are directly passed into the *main* job that the wrapper
+launches. These arguments are never checked when you submit a job, so if you have some mistake in them, you will only
+know about that when the job starts running. For most of our *main* scripts we use [Hydra](https://hydra.cc/) and thus
+their arguments typically start with `++arg_name`. If you're using Python API you would need to specify all *main* arguments with
+`ctx=wrap_arguments("...")` interface for technical reasons.
+
+This might sound a little complicated, so let's see how it works through an example from the [Getting Started Tutorial](../basics/index.md).
+
+=== "ns interface"
+
+    ```bash
+    ns generate \
+        --cluster=local \
+        --server_type=trtllm \
+        --model=/workspace/qwen2.5-1.5b-instruct-trtllm \
+        --server_gpus=1 \
+        --output_dir=/workspace/generation-local-trtllm \
+        --input_file=/workspace/input.jsonl \
+        ++prompt_config=/workspace/prompt.yaml \
+        ++prompt_template=qwen-instruct
+    ```
+
+=== "python interface"
+
+    ```python
+    from nemo_skills.pipeline.cli import wrap_arguments, generate
+
+    generate(
+        cluster="local",
+        server_type="trtllm",
+        model="/workspace/qwen2.5-1.5b-instruct-trtllm",
+        server_gpus=1,
+        output_dir="/workspace/generation-local-trtllm",
+        input_file="/workspace/input.jsonl",
+        ctx=wrap_arguments(
+            "++prompt_config=/workspace/prompt.yaml "
+            "++prompt_template=qwen-instruct "
+        ),
+    )
+    ```
+
+In this command all arguments starting with `--` are *wrapper* arguments and everything starting with `++` are *main* arguments.
+If you run `ns generate --help` you will see all the ones with `--` listed there (and more), but not the `++` ones.
+The help output also contains this message that specifies which underlying *main* script we run for this command and how
+to check its arguments
+
+```bash
+`python -m nemo_skills.inference.generate --help` for other supported arguments
+```
+
+You can also open that script's code in
+[nemo_skills/inference/generate.py](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/inference/generate.py)
+and see all arguments and logic there.
+
+You can chain multiple pipelines together to set proper slurm dependencies using `--run_after` parameter.
+There is an example in [tutorial](../basics/index.md#slurm-inference) or in
+[training documentation](training.md#chaining-pipelines-with-python).
+
+
+## Common parameters
+
+Many of our scripts have a shared set of common parameters that we list here.
+
+### All pipeline scripts
+
+All scripts inside pipeline folder have the following parameters.
+
+- **--cluster**: You always need to specify a cluster config that will be used to
+  control where the job is executed.
+- **--config_dir**: By default we search for cluster configs inside `cluster_configs`
+  local folder, but you can control where they are located with this parameter.
+  You can also use `NEMO_SKILLS_CONFIG_DIR` environment variable for this purpose.
+- **--log_dir**: Can be used to customize the location of slurm logs.
+- **--expname**: You can always specify an experiment name, which is a
+  [NeMo-Run](https://github.com/NVIDIA/NeMo-Run) concept. This will control where
+  the metadata is stored, the slurm job name and allows you to chain jobs one
+  after the other using the `--run_after` argument.
+- **--run_after**: Can be used in conjunction with `--expname` to chain jobs to
+  run one after another (only applicable on slurm). E.g. run training job with
+  `--expname my-training-run` and then launch an eval with `--run_after my-training-run`.
+- **--mount_paths**: Can be used to mount additional paths to the cluster config dynamically.
+  This is useful if you want to access some data that is not mounted in cluster config. E.g. use
+  `--mount_paths /my/remote/workspace:/workspace` to mount `/workspace` folder from the host
+  machine to the slurm job.
+- **--check_mounted_paths**: This flag offers a few different capabilities for convenience:
+    - Check if the paths specified in the script are mounted correctly. This is useful if you want to make
+    sure that the paths that are mounted are available on the remote machine before running the job.
+    E.g. use `--check_mounted_paths` to check if `/my/remote/workspace` folder from the host machine
+    is a folder that exists and can be mounted.
+    - In many cases, if the directory does not exist, we will create it for you. This is useful for
+    output and log directories.
+    - If paths are provided but not mounted, often times we will dynamically mount them for you.
+- **--partition**: Can be used to run in a specific slurm partition (e.g. commonly used
+  to launch interactive jobs).
+- **--not_exclusive**: Can be used if you want to request a part of the slurm node. By default
+  we set `exclusive=True`.
+- **--time_min**: Can be used to specify minimum time after which the job might be killed by slurm.
+  Specify in the following format `00:30:00` (for 30 minutes). Using a lower value will help jobs
+  get scheduled faster.
+- **--reuse_code** / **--reuse_code_exp**: Can be used to specify another experiment and reuse
+  its code (to avoid re-packaging/uploading to cluster). If running from Python we will automatically
+  reuse the last submitted experiment in the current Python session.
+
+### Generation scripts
+
+All of the scripts that involve LLM data generation accept a common set of parameters.
+
+- **--model**: Either path to the model file or an API model name.
+- **--server_type**: `nemo`, `trtllm`, `vllm` or `openai`. This is used on the client side
+  to correctly format a request to a particular server. This needs to match model
+  checkpoint format if self-hosting the model or has to be `openai` for both Nvidia
+  NIM API as well as the OpenAI API.
+- **--server_address**: Only relevant for API models. E.g. use
+  `https://integrate.api.nvidia.com/v1` for Nvidia API and
+  `https://api.openai.com/v1` for OpenAI API.
+- **--server_gpus**: Number of GPUs needed to host a model (only applicable to self-hosted models).
+- **--server_nodes**: Number of nodes needed to host a model (only applicable to self-hosted models).
+- **--server_args**: Any other arguments you need to pass to a corresponding server.
+  E.g. use `--server_args="--gpu-memory-utilization=0.99"` to change gpu memory utilization of a
+  vLLM server.
\ No newline at end of file
--- a/docs/pipelines/llm-as-a-judge.md
+++ b/docs/pipelines/llm-as-a-judge.md
+# LLM-as-a-judge for math evaluation
+
+!!! info
+
+    This pipeline starting script is [nemo_skills/pipeline/generate.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/generate.py)
+
+    All extra parameters are passed to [nemo_skills/inference/llm_math_judge.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/inference/llm_math_judge.py)
+
+When evaluating complex mathematical questions, it's very hard to have a rule-based symbolic comparison system.
+While we do perform such comparison by default, for most accurate results it's best to use LLM-as-a-judge pipeline.
+E.g. symbolic comparison can perform very inaccurately for multi-choice questions where an answer might either be
+one of the letters or an expression corresponding to that letter.
+
+If you have an output of the [evaluation script](evaluation.md) on e.g. math benchmark, you can run LLM-as-a-judge
+in the following way (assuming you have `/workspace` mounted in your [cluster config](../basics/cluster-configs.md)
+and evaluation output available in `/workspace/test-eval/eval-results`).
+
+```bash
+ns generate \
+    --generation_type=math_judge \
+    --cluster=local \
+    --model=gpt-4o \
+    --server_type=openai \
+    --server_address=https://api.openai.com/v1 \
+    --output_dir=/workspace/test-eval-judge/eval-results/math \
+    --input_dir=/workspace/test-eval/eval-results/math \
+    --num_random_seeds=<num seeds used for generation>
+```
+
+This will run the judge pipeline on the data inside `eval-results/math` folder and judge solutions from `output-rsX.jsonl` files.
+If you ran the benchmark with a single generation (e.g. using `math` or `math:0`) then
+use `--input_file=/workspace/test-eval/eval-results/math/output.jsonl` instead of `--input_dir` and `--num_random_seeds` arguments.
+
+In this example we use gpt-4o from OpenAI, but you can use Llama-405B (that you can host on cluster yourself) or any
+other models. If you have multiple benchmarks, you would need to run the command multiple times.
+After the judge pipeline has finished, you can see the results by running
+
+```bash
+ns summarize_results /workspace/test-eval-judge --cluster local
+```
+
+Which should output something like this
+
+```
+------------------------------------------------- aime24 ------------------------------------------------
+evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
+pass@1          | 30          | 20.00            | 20.00         | 20.00        | 20.00       | 13.33
+
+
+------------------------------------------------- gsm8k -------------------------------------------------
+evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
+pass@1          | 1319        | 95.00            | 95.75         | 95.00        | 95.75       | 0.00
+
+
+-------------------------------------------------- math -------------------------------------------------
+evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
+pass@1          | 5000        | 67.32            | 67.88         | 67.02        | 68.18       | 2.64
+
+
+------------------------------------------------- amc23 -------------------------------------------------
+evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
+pass@1          | 40          | 47.50            | 47.50         | 47.50        | 47.50       | 7.50
+```
+
+If you want to see where symbolic comparison differs from judge comparison, run with `--debug` option.
+
+We use the following [judge prompt](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/prompt/config/judge/math.yaml)
+by default, but you can customize it the same way as you [customize any other prompt](../basics/prompt-format.md).
\ No newline at end of file
--- a/docs/pipelines/run-cmd.md
+++ b/docs/pipelines/run-cmd.md
+# Running arbitrary commands
+
+!!! info
+
+    This pipeline starting script is [nemo_skills/pipeline/run_cmd.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/run_cmd.py)
+
+    All extra parameters are directly executed as a shell command.
+
+We often need to run arbitrary pre/post processing commands as part of a larger pipeline and thus we provide a simple
+`run_cmd` utility that can be used to schedule those on slurm. Here is an example that simply enters the packaged
+code and tries to install it (will finish with error if not running from NeMo-Skills repo or other installable package).
+
+```bash
+ns run_cmd --cluster=local cd /nemo_run/code/ && pip install -e .
+```
+
+There are many more examples of how to use `run_cmd` throughout our documentation.
+
+## LLM Server and Sandbox Server
+
+While we can run arbitrary commands with the default `run_cmd` script, we also provide the ability to
+run an LLM server with the `--model` argument and a few extra arguments for the server config. These arguments
+are similar to the ones used for `start_server` script.
+
+This can be useful to run a server on a local machine or on a cluster with GPUs in a slurm job, while also being able to
+run arbitrary code that uses LLM calls.
+
+### Example
+
+Say you have the following inference file that uses OpenAI API with a vLLM backed server (say to run a 
+project that is compatible with OpenAI API). Imagine a file called `inference.py` with the following code:
+
+```python
+from openai import OpenAI
+client = OpenAI(api_key='EMPTY', base_url=f"http://0.0.0.0:5000/v1", timeout=None)
+api_model = client.models.list().data[0].id
+
+response = client.chat.completions.create(
+    model=api_model,
+    messages=[
+        {"role": "user", "content": "What is the capital of France?"},
+    ],
+    temperature=0.7,
+    max_tokens=128,
+    top_p=0.95,
+    n=1,
+    stream=False,
+)
+print(response.choices[0].message.content)
+```
+
+Then we can run the server and the inference code in a single command as below. The `--with_sandbox` argument starts the
+code execution server that can be used to run arbitrary code in a sandboxed environment and is added here just as a 
+demonstration. While the current example does not use it, this can be useful to execute code or to run code that 
+requires a specific environment in a container. 
+
+**Note**: While the container is a little more secure than running code directly on the host, it is still not a fully 
+secure sandbox and should not be used to run untrusted code.
+
+```bash
+ns run_cmd \
+    --cluster=local \
+    --model=Qwen/Qwen3-1.7B \
+    --server_type=vllm \
+    --server_gpus=1 \
+    --with_sandbox \
+    cd /nemo_run/code/ && python inference.py
+```
+
+This will launch the LLM inference server, the sandbox server and then run the inference code.
\ No newline at end of file
--- a/docs/pipelines/training-verl-openrlhf.md
+++ b/docs/pipelines/training-verl-openrlhf.md
+# Training using verl or OpenRLHF
+
+!!! info
+
+    Depending on the algorithm/framework, this pipeline starting script is
+
+    * [nemo_skills/pipeline/openrlhf/sft.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/openrlhf/sft.py)
+
+    * [nemo_skills/pipeline/openrlhf/ppo.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/openrlhf/ppo.py)
+
+    * [nemo_skills/pipeline/verl/ppo.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/verl/ppo.py)
+
+    All extra parameters are passed to
+
+    * [openrlhf.cli.train_sft](https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/cli/train_sft.py)
+
+    * [openrlhf.cli.train_ppo_ray](https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/cli/train_ppo_ray.py)
+
+    * [verl.trainer.main_ppo](https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py)
+
+!!! warning
+
+    OpenRLHF support is experimental and incomplete. We use the following
+    custom fork and it might not be easy to switch to the official repository version.
+
+    * OpenRLHF: https://github.com/Kipok/OpenRLHF
+
+    The documentation here is incomplete and we advise you to open an issue if you
+    plan to try something that is not covered below to get additional support.
+
+    For OpenRLHF, please use the following non-default container `vllm: igitman/nemo-skills-vllm:0.6.0`
+
+## SFT with OpenRLHF
+
+Here is an example of running SFT job with OpenRLHF.
+Our standard [SFT data format](./training.md#preparing-the-data) can be
+used here.
+
+```python
+from nemo_skills.pipeline.cli import wrap_arguments, sft_openrlhf
+
+sft_openrlhf(
+    ctx=wrap_arguments(""),
+    cluster="slurm",
+    expname="test-openrlhf-sft",
+    output_dir="/workspace/test-openrlhf-sft",
+    hf_model="/hf_models/Qwen2.5-1.5B-Instruct",
+    training_data="/data/sft-data.jsonl",
+    num_gpus=8,
+    num_nodes=2,
+    num_training_jobs=1,
+)
+```
+
+## PPO with OpenRLHF
+
+Here is an example of running PPO job with OpenRLHF.
+Our standard [SFT data format](./training.md#preparing-the-data) can be
+used here.
+
+```python
+from nemo_skills.pipeline.cli import wrap_arguments, ppo_openrlhf
+
+ppo_openrlhf(
+    ctx=wrap_arguments(
+        "--ref_num_gpus_per_node=4 "
+        "--actor_num_gpus_per_node=4 "
+        "--vllm_num_engines=2 "
+        "--vllm_tensor_parallel_size=2 "
+        "--ref_num_nodes=1 "
+        "--actor_num_nodes=1 "
+        "--colocate_actor_ref "
+        "--advantage_estimator=reinforce "
+        "--remote_rm_url /nemo_run/code/nemo_skills/training/openrlhf/math_reward.py "
+    ),
+    cluster="slurm",
+    expname="test-openrlhf-ppo",
+    output_dir="/workspace/test-openrlhf-ppo",
+    hf_model="/hf_models/Qwen2.5-1.5B-Instruct",
+    prompt_data="/data/rl-data.jsonl",
+    num_gpus=8,
+    num_nodes=2,
+    # this is used for the LLM judge
+    server_gpus=8,
+    server_type='trtllm',
+    server_model='/hf_models/Qwen2.5-32B-Instruct',
+    num_training_jobs=1,
+)
+```
+
+## PPO with verl
+
+Here is an example of running PPO job with verl.
+You can use [nemo_skills/training/verl/prepare_data.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/verl/prepare_data.py) to convert
+our standard [SFT data format](./training.md#preparing-the-data) into parquet.
+
+```python
+from nemo_skills.pipeline.cli import wrap_arguments, ppo_verl
+
+ppo_verl(
+    ctx=wrap_arguments(
+        '++trainer.save_freq=0 '
+        '++data.train_batch_size=32 '
+        '++reward_model.compute_score=math-judge '
+        '++reward_model.reward_manager=batched '
+        '++data.filter_prompts=False '
+        '++actor_rollout_ref.rollout.gpu_memory_utilization=0.7 '
+        '++data.max_response_length=12000 '
+        '++actor_rollout_ref.rollout.n=64 '
+        '++actor_rollout_ref.rollout.tensor_model_parallel_size=2 '
+    ),
+    cluster="slurm",
+    expname="test-verl-ppo",
+    output_dir="/workspace/test-verl-ppo",
+    hf_model="/hf_models/Qwen2.5-1.5B-Instruct",
+    prompt_data="/data/rl-data.parquet",
+    num_gpus=8,
+    num_nodes=2,
+    # this is used for the LLM judge
+    server_gpus=8,
+    server_type='trtllm',
+    server_model='/hf_models/Qwen2.5-32B-Instruct',
+    num_training_jobs=1,
+)
+```
\ No newline at end of file
--- a/docs/pipelines/training.md
+++ b/docs/pipelines/training.md
+# Training using NeMo-Aligner
+
+!!! info
+
+    This pipeline starting script is [nemo_skills/pipeline/train.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/train.py)
+
+    All extra parameters are passed to either [nemo_skills/training/start_sft.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/start_sft.py) or [nemo_skills/training/start_dpo.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/start_dpo.py)
+
+
+## Preparing the data
+
+Before running the training we need to prepare the data in the right format. Here is an example command
+
+```bash
+python -m nemo_skills.training.prepare_data \
+    ++input_files="<path to the generated synthetic data>/output-rs*.jsonl" \
+    ++output_path=sft-data.jsonl \
+    ++prompt_config=generic/math \
+    ++prompt_template=llama3-instruct
+```
+
+!!! tip
+
+    Many scripts accept `++input_files` argument. You can use any glob patterns there and also
+    reference multiple files/patterns separated by space or comma.
+
+If you want to run that command inside container or on cluster, add `ns run_cmd --cluster=...` in the beginning.
+
+You need to pass in the config/template files so that we can format the data accordingly. There are many more parameters
+that data preparation script supports which you can see
+[here](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/data_preparation_utils/config/math_sft.yaml).
+We are using [SDP library](https://github.com/NVIDIA/NeMo-speech-data-processor) for preparing the data, so it's
+a good idea to check their documentation to understand how this config is structured.
+
+!!! note
+
+    Even though we support both SFT and DPO training, the data preparation is currently only implemented
+    for SFT jobs. For DPO, you'd need to manually prepare the data according to the
+    [NeMo-Aligner documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/modelalignment/dpo.html#dpo-model-training)
+
+
+## Running training
+
+We use [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/) to run LLM training,
+so you can check their documentation to learn about all supported parameters.
+
+Here is an example of how to run a training job.
+
+```bash
+ns train \
+    --cluster=slurm \
+    --expname=my-training-job \
+    --output_dir=/workspace/my-training-job/checkpoints \
+    --nemo_model=/nemo_models/llama3.1-8b-base \
+    --num_nodes=8 \
+    --num_gpus=8 \
+    --num_training_jobs=4 \
+    --training_data=/data/sft-data.jsonl
+```
+
+This will run training on 8 nodes of 8 GPUs, using 4 dependent slurm jobs.
+By default we are training for 2 epochs, saving checkpoints every 1000 steps,
+but you can adjust these values. It's also recommended to tune micro batch size
+and tensor parallel parameters for optimal performance. E.g. these are good
+defaults for an 8B model size
+
+```bash
+    ++model.data.train_ds.micro_batch_size=4 \
+    ++model.tensor_model_parallel_size=4
+```
+
+You can customize any of the SFT parameters by directly providing them, e.g.
+to disable wandb logging and add dropout use
+
+```bash
+   --disable_wandb \
+   ++model.ffn_dropout=0.1 \
+   ++model.attention_dropout=0.1 \
+   ++model.hidden_dropout=0.1
+```
+
+The training script will average all of your generated checkpoints upon completion
+(we found this to consistently increase the downstream accuracy). If you want to
+only average a subset of checkpoints, add `--average_steps` parameter (e.g. if you
+want to disable averaging, set it to the last training step). If you only want
+to average the checkpoints of the finished job, set `--num_training_jobs=0`.
+
+## Chaining pipelines with Python
+
+Typically after training we want to follow up with evaluation. You can schedule
+an evaluation job right away by providing a `--run_after=my-training-job` argument
+which will appropriately set slurm dependencies. Here is how you can chain the commands
+to schedule checkpoint conversion and evaluation after training
+(whenever you need to run multiple commands, it's more convenient to use python interface)
+
+```python
+from nemo_skills.pipeline.cli import wrap_arguments, train, convert, eval
+
+expname = "my-training-job"
+cluster = "slurm"
+output_dir = f"/workspace/{expname}/checkpoints"
+
+train(
+    ctx=wrap_arguments(""),
+    cluster=cluster,
+    expname=expname,
+    output_dir=output_dir,
+    nemo_model="/nemo_models/llama3.1-8b-base",
+    num_nodes=8,
+    num_gpus=8,
+    num_training_jobs=4,
+    training_data="/data/sft-data.jsonl",
+)
+
+convert(
+    ctx=wrap_arguments(""),
+    cluster=cluster,
+    input_model=f"{output_dir}/model-averaged-nemo",
+    output_model=f"{output_dir}/model-averaged-hf",
+    expname=f"{expname}-to-hf",
+    run_after=expname,
+    convert_from="nemo",
+    convert_to="hf",
+    model_type="llama",
+    num_gpus=8,
+    hf_model_name="meta-llama/Meta-Llama-3.1-8B",
+)
+
+eval(
+    ctx=wrap_arguments("++prompt_template=llama3-instruct"),
+    cluster=cluster,
+    model=f"{output_dir}/model-averaged-hf",
+    server_type="trtllm",
+    output_dir=f"{output_dir}/results/",
+    benchmarks="gsm8k,math",
+    server_gpus=8,
+    run_after=f"{expname}-to-hf",
+)
+```
+
+## Using sequence packing and context parallel
+
+When training on sequences >4k or so, it's recommended to use sequence packing and context parallel.
+Here is an example of how to do that. Most of the parameters don't need to change, but
+the `global_batch_size` might need to be adjusted to be n times smaller than without packing
+where n is the average number of sequences per pack, that packing script outputs, e.g.
+
+```
+[NeMo I 2025-01-16 13:57:37 prepare_packed_ft_dataset:165] Packing sequences to length 16384...
+[NeMo I 2025-01-16 15:06:24 prepare_packed_ft_dataset:182] Packing is 98.23% efficient
+[NeMo I 2025-01-16 15:06:24 prepare_packed_ft_dataset:183] >>>>> For pack size 16384, average number of sequences per pack is n = 3.669 <<<<<
+```
+
+Here is an example of running packing and training.
+
+```python
+from nemo_skills.pipeline.cli import wrap_arguments, train, run_cmd
+
+expname = "my-training-job"
+cluster = "slurm"
+output_dir = f"/workspace/{expname}/checkpoints"
+
+# your memory consumption will be similar to a job with
+# `pack_seq_length / context_parallel` sequences without packing
+pack_seq_length = 16384
+context_parallel = 4
+
+original_bs = 512
+avg_sequences_per_pack = 3.7
+# you need to make sure this is divisible by your data parallel rank,
+# so might need to round to a power of 2
+packed_bs = original_bs // avg_sequences_per_pack
+
+# Make sure that train_ds.file_names is included in the bucket e.g., [/data/sft-data.jsonl]
+packing_cmd = (
+    f"python /nemo_run/code/nemo_skills/training/prepare_packed_ft_dataset.py "
+    f"    ++model.data.train_ds.file_names=[/data/sft-data.jsonl] "
+    f"    ++model.data.train_ds.max_seq_length={pack_seq_length} "
+    f"    ++model.context_parallel_size={context_parallel} "
+    f"    ++tokenizer_path=/hf_models/Meta-Llama-3.1-8B "
+    f"    ++output_dir=/data "
+    f"    ++pack_sizes=[{pack_seq_length}] "
+    f"    ++model.data.train_ds.hf_dataset=True "
+)
+
+run_cmd(
+    ctx=wrap_arguments(packing_cmd),
+    cluster=cluster,
+    expname=f"{expname}-packing",
+    container="nemo", # please use "nemo container" for packed data preparation
+    # this is a cpu-only operation, so if a cluster has a good cpu partition, it can be used
+    # note that this is an expensive operation requiring a lot of CPUs and RAM
+)
+
+
+# The `packing_cmd` generates three files when `pack_seq_length=16384` is used, for example:
+
+#  `packed_16384_seed0.input_ids.npy`
+#  `packed_16384_seed0.loss_mask.npy`
+#  `packed_16384_seed0.seq_start_id.npy`
+
+# For training, set training_data=packed_16384_seed0.npy
+# Refer to the _load_dataset_alt function in nemo_skills/training/gpt_sft_dataset.py for details on why this is required.
+
+train(
+    ctx=wrap_arguments(
+        f"++model.data.train_ds.packed_sequence=True "
+        f"++model.data.train_ds.micro_batch_size=1 "  # should always be 1 for packed jobs
+        f"++model.data.train_ds.global_batch_size={packed_bs} "
+        f"++model.context_parallel_size={context_parallel} "
+        f"++model.data.train_ds.max_seq_length={pack_seq_length} "
+        # all other parameters are generally the same as for the non-packed job with
+        # max seq length = packed_seq_length / context_parallel
+        # and keep in mind that each step now processes avg_sequences_per_pack * packed_bs examples
+    ),
+    cluster=cluster,
+    expname=expname,
+    run_after=f"{expname}-packing",
+    output_dir=output_dir,
+    nemo_model="/nemo_models/llama3.1-8b-base",
+    num_nodes=8,
+    num_gpus=8,
+    num_training_jobs=4,
+    training_data=f"/data/packed_{pack_seq_length}_seed0.npy",
+)
+
+# can follow up with the same convert/eval steps as above
+```
+
+If your data size is very large (i.e. >1M samples), you might run out of memory when doing packing on full data.
+If that's the case, it's recommended to split data into smaller chunks and then merge them using
+[nemo_skills/training/merge_packed_data.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/merge_packed_data.py)
+
+Example command:
+
+```bash
+python nemo_skills/training/merge_packed_data.py \
+    --input_prefixes <chunk 1 folder>/packed_24576_seed0 <chunk 2 folder>/packed_24576_seed0 \
+    --output_prefix <final data folder>/packed_24576_seed0
+```
--- a/docs/releases/index.md
+++ b/docs/releases/index.md
+---
+title: Papers & Releases
+hide:
+  - toc
+---
+
+On this page you can find a list of papers, model and dataset releases that were created using NeMo-Skills.
+
+## Releases
+
+* [OpenReasoning](openreasoning/index.md) models
+
+* [OpenCodeReasoning](opencodereasoning/index.md) dataset and models
+
+* [OpenMathReasoning](openmathreasoning/index.md) dataset and models
+
+* [OpenMathInstruct-2](openmathinstruct2/index.md) dataset and models
+
+## Papers
+
+* [GenSelect: A Generative Approach to Best-of-N](https://openreview.net/pdf?id=8LhnmNmUDb){:target="_blank"} (2025)
+
+* [The Challenge of Teaching Reasoning to LLMs Without RL or Distillation](https://arxiv.org/abs/2507.09850){:target="_blank"} (2025)
+
+* [OpenCodeReasoning: Advancing Data Distillation for Competitive Coding](https://arxiv.org/abs/2504.01943){:target="_blank"} (2025)
+
+* [AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning Models with OpenMathReasoning dataset](https://arxiv.org/abs/2504.16891){:target="_blank"} (2025)
+
+* [OpenMathInstruct-2: Accelerating AI for Math with Massive Open-Source Instruction Data](https://arxiv.org/abs/2410.01560){:target="_blank"} (2024)
+
+* [OpenMathInstruct-1: A 1.8 Million Math Instruction Tuning Dataset](https://arxiv.org/abs/2402.10176){:target="_blank"} (2024)
--- a/docs/releases/opencodereasoning/dataset.md
+++ b/docs/releases/opencodereasoning/dataset.md
+# Dataset construction
+
+[OpenCodeReasoning-1](https://huggingface.co/datasets/nvidia/OpenCodeReasoning) and [OpenCodeReasoning-2](https://huggingface.co/datasets/nvidia/OpenCodeReasoning-2) datasets consist of competitive coding problems collected from [TACO](https://huggingface.co/datasets/BAAI/TACO), [APPS](https://huggingface.co/datasets/codeparrot/apps), [CodeContests](https://huggingface.co/datasets/deepmind/code_contests) and [CodeForces](https://huggingface.co/datasets/open-r1/codeforces). Below we describe the pipeline used to create these datasets. All relevant scripts are available in
+[recipes/opencodereasoning](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/opencodereasoning) folder.
+
+If you don't have a slurm cluster with a large number of GPUs,
+you can still try out all the steps of our pipeline by using [Nvidia NIM models](https://build.nvidia.com/). You can extract the questions set in its entirety following the [prepare_questions.py script](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/opencodereasoning/pipeline/prepare_questions.py) and you can
+switch to that data and NIM models by adding `--mode demo` to the pipeline commands. We also use different models
+in this "demo" mode to make it faster, but you can change [configs/demo.yaml](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/opencodereasoning/configs/demo.yaml) to pick
+any other models supported in https://build.nvidia.com. Make sure to define `NVIDIA_API_KEY` environment variable for this to work
+(and ignore scraping and model preparation steps as they are not needed when using NIM models).
+
+Finally, please make sure to go through the
+[getting started documentation](../../basics/index.md) to make sure you understand how the below commands
+work and avoid running into errors.
+
+
+## Data preparation (Question set)
+
+The question set is preprocessed as part of the [prepare_questions.py](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/opencodereasoning/pipeline/prepare_questions.py) script. This script will download the original datasets, extract just the questions and filter out super long instructions that may interfere with training.
+
+**Note**: OCR-1 questions are a subset of OCR-2 questions, and it is recommended to generate data for OCR-2 directly.
+
+To download and preprocess the question set you can run the following script. We assume that `/workspace` points to the directory where NeMo-Skills is cloned, but you can change the output directory to any other location:
+
+```bash
+python prepare_questions.py --cluster local --expname "toy" --output_dir "/workspace/recipes/opencodereasoning/data/"
+```
+
+This script will download the 4 individual seed datasets above, along with the OpenCodeReasoning-2 dataset in order to perform a mapping from question ids to questions, gather the unique questions in the dataset, and truncate the discussions that are longer than 3200 Qwen 2.5 tokens. The prepared data will be saved as `open_code_reasoning_questions.jsonl`.
+
+The output file should have ~34K rows, so all of the following commands will take a very long time and require a big
+number of GPUs if you want to run them on full data. If you just want to try out the full pipeline, we recommend to subsample
+the dataset by e.g. running
+
+```bash
+mv open_code_reasoning_questions.jsonl open_code_reasoning_questions_full.jsonl
+head -n 1000 open_code_reasoning_questions_full.jsonl > open_code_reasoning_questions.jsonl
+```
+
+**Note**: The questions from this dataset are already decontaminated against LiveCodeBench v6 2408-2505. However, if you are evaluating against a newer version of LiveCodeBench, you may need to perform decontamination yourself. You can follow the instructions here to construct a [decontamination pipeline](https://nvidia.github.io/NeMo-Skills/pipelines/decontamination/).
+
+## Solution generation pipeline
+
+[Solution generation pipeline](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/opencodereasoning/pipeline/prepare_solutions.py)
+consists of the following stages:
+
+1. Generate solutions using some reasoning model for each of the prepared problems (`generate_solutions` stage).
+2. Filter the solutions based on whether the reasoning trace completed successfully or not (`filter_solutions` stage).
+
+You can run the full pipeline with
+
+```bash
+python recipes/opencodereasoning/pipeline/prepare_solutions.py --mode r1
+```
+
+You can specify a subset of stages using `--stages` argument, e.g. `--stages generate_solutions` or `--stages generate_solutions,filter_solutions`.
+
+If you want to run using [Nvidia NIM models](https://build.nvidia.com/models), change to `--mode demo`.
+
--- a/docs/releases/opencodereasoning/evaluation.md
+++ b/docs/releases/opencodereasoning/evaluation.md
+# Model evaluation
+
+Here are the commands you can run to reproduce our evaluation numbers.
+The commands below are for [nvidia/OpenCodeReasoning-Nemotron-1.1-7B](https://huggingface.co/nvidia/OpenCodeReasoning-Nemotron-1.1-7B) model as an example.
+We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
+executing all commands from that folder locally. Change all commands accordingly
+if running on slurm or using different paths.
+
+## Download models
+
+Get the model from HF. E.g.
+
+```bash
+# cd into your /workspace folder
+pip install -U "huggingface_hub[cli]"
+huggingface-cli download nvidia/OpenCodeReasoning-Nemotron-1.1-7B --local-dir OpenCodeReasoning-Nemotron-1.1-7B
+```
+
+## Prepare evaluation data
+
+```bash
+ns prepare_data livecodebench
+```
+
+## Run evaluation
+
+```bash
+ns eval \
+    --cluster=local \
+    --model=/workspace/OpenCodeReasoning-Nemotron-1.1-7B \
+    --server_type=vllm \
+    --output_dir=/workspace/OpenCodeReasoning-Nemotron-1.1-7B-eval \
+    --benchmarks=livecodebench:8 \
+    --split=test_v6_2408_2505 \
+    --server_gpus=1 \
+    ++prompt_template=qwen-instruct \
+    ++inference.tokens_to_generate=64000
+```
+
+Finally, to print the metrics run
+
+```bash
+ns summarize_results /workspace/OpenCodeReasoning-Nemotron-1.1-7B-eval/eval-results --cluster local
+```
+
+The numbers may vary by 1-2% depending on the server type, number of GPUs and batch size used.
--- a/docs/releases/opencodereasoning/index.md
+++ b/docs/releases/opencodereasoning/index.md
+# OpenCodeReasoning
+
+This section has instructions for training a model that attains results similar to
+[OpenCodeReasoning](https://arxiv.org/abs/2504.01943).
+
+Please note that unless you have access to a large GPU cluster, it might take a very long time
+for some of the commands to complete!
+
+- [Model evaluation](evaluation.md)
+- [Dataset construction](dataset.md)
--- a/docs/releases/openmathinstruct2/dataset.md
+++ b/docs/releases/openmathinstruct2/dataset.md
--- a/docs/releases/openmathinstruct2/evaluation.md
+++ b/docs/releases/openmathinstruct2/evaluation.md
+# Model evaluation
+
+Here are the commands you can run to reproduce our evaluation numbers.
+The commands below are for the OpenMath2-Llama3.1-8B model as an example.
+We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
+executing all commands from that folder locally. Change all commands accordingly
+if running on slurm or using different paths.
+
+## Download models
+
+Get the model from HF. E.g.
+
+```bash
+pip install -U "huggingface_hub[cli]"
+huggingface-cli download nvidia/OpenMath2-Llama3.1-8B --local-dir OpenMath2-Llama3.1-8B
+```
+
+## Prepare evaluation data
+
+```bash
+ns prepare_data gsm8k math amc23 aime24 omni-math
+```
+
+## Run greedy decoding
+
+```bash
+ns eval \
+    --cluster=local \
+    --model=/workspace/OpenMath2-Llama3.1-8B \
+    --server_type=trtllm \
+    --output_dir=/workspace/openmath2-llama3.1-8b-eval \
+    --benchmarks=aime24,amc23,math,gsm8k,omni-math \
+    --server_gpus=1 \
+    --num_jobs=1 \
+    ++prompt_template=llama3-instruct \
+    ++inference.tokens_to_generate=4096
+```
+
+If running on slurm, you can set `--num_jobs` to a bigger number or -1 to run
+each benchmark on a separate node. The number of GPUs needs to match what you used
+in the conversion command.
+
+After the generation is done, we want to run LLM-as-a-judge evaluation to get more
+accurate numbers than symbolic comparison. You need to define `OPENAI_API_KEY` for
+the command below to work.
+
+```bash
+for dataset in aime24 amc23 math gsm8k omni-math; do
+    ns generate \
+        --generation_type=math_judge \
+        --cluster=local \
+        --model=gpt-4o \
+        --server_type=openai \
+        --server_address=https://api.openai.com/v1 \
+        --output_dir=/workspace/openmath2-llama3.1-8b-eval-judged/eval-results/${dataset} \
+        --input_dir=/workspace/openmath2-llama3.1-8b-eval/eval-results/${dataset}
+done
+```
+
+Finally, to print the metrics run
+
+```bash
+ns summarize_results /workspace/openmath2-llama3.1-8b-eval-judged/eval-results --cluster local
+```
+
+This should print the metrics including both symbolic and judge evaluation. The judge is typically more accurate.
+
+```
+------------------------------------------------- aime24 ------------------------------------------------
+evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
+pass@1          | 30          | 10.00            | 10.00         | 10.00        | 10.00       | 6.67
+
+
+------------------------------------------------- gsm8k -------------------------------------------------
+evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
+pass@1          | 1319        | 90.75            | 91.70         | 90.75        | 91.70       | 0.00
+
+
+----------------------------------------------- omni-math -----------------------------------------------
+evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
+pass@1          | 4428        | 18.97            | 22.22         | 18.11        | 23.08       | 2.55
+
+
+-------------------------------------------------- math -------------------------------------------------
+evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
+pass@1          | 5000        | 67.70            | 68.10         | 67.50        | 68.30       | 1.36
+
+
+------------------------------------------------- amc23 -------------------------------------------------
+evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
+pass@1          | 40          | 32.50            | 40.00         | 32.50        | 40.00       | 0.00
+```
+
+The numbers may vary by 1-2% depending on the server type, number of GPUs and batch size used.
+
+## Run majority voting
+
+```bash
+ns eval \
+    --cluster=local \
+    --model=/workspace/OpenMath2-Llama3.1-8B \
+    --server_type=trtllm \
+    --output_dir=/workspace/openmath2-llama3.1-8b-eval \
+    --benchmarks=aime24:256,amc23:256,math:256,gsm8k:256,omni-math:256 \
+    --server_gpus=1 \
+    --num_jobs=1 \
+    ++prompt_template=llama3-instruct \
+    ++inference.tokens_to_generate=4096
+```
+
+This will take a very long time unless you run on slurm cluster. After the generation is done, you will be able
+to see symbolic scores right away. You can evaluate with the judge by first creating new files with majority
+answers. E.g. for "math" benchmark run
+
+```bash
+python -m nemo_skills.evaluation.aggregate_answers \
+    ++input_dir="./openmath2-llama3.1-8b-eval/eval-results/math" \
+    ++input_files="output-rs*.jsonl" \
+    ++mode=extract \
+    ++output_dir="./openmath2-llama3.1-8b-eval/eval-results-majority/math"
+```
+
+This will output "./openmath2-llama3.1-8b-eval/eval-results-majority/math/output-agg.jsonl" file with majority answer. We can run the llm-judge pipeline on it.
+
+
+Repeat the above steps for all benchmarks. Now we are ready to run the judge pipeline and summarize results
+after it is finished. You need to define `OPENAI_API_KEY` for the command below to work.
+
+```bash
+for dataset in aime24 amc23 math gsm8k omni-math; do
+    ns generate \
+        --generation_type=math_judge \
+        --cluster=local \
+        --model=gpt-4o \
+        --server_type=openai \
+        --server_address=https://api.openai.com/v1 \
+        --output_dir=/workspace/openmath2-llama3.1-8b-eval-judged/eval-results-majority/${dataset} \
+        --input_file=/workspace/openmath2-llama3.1-8b-eval/eval-results-majority/${dataset}/output-agg.jsonl
+done
+```
+
+```bash
+ns summarize_results /workspace/openmath2-llama3.1-8b-eval-judged/eval-results-majority --cluster local
+```
+
+This will print majority results (they will be labeled as `greedy` since we fused them into a single file).
+You can also ignore the symbolic score as it's not accurate anymore after we filled majority answers.
--- a/docs/releases/openmathinstruct2/index.md
+++ b/docs/releases/openmathinstruct2/index.md
+# OpenMathInstruct-2
+
+Using our pipelines we created [OpenMathInstruct-2 dataset](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2)
+which consists of 14M question-solution pairs (600K unique questions), making it nearly eight times larger
+than the previous largest open-source math reasoning dataset.
+
+The models trained on this dataset achieve strong results on common mathematical benchmarks.
+
+<table>
+  <tr>
+    <td style="text-align: center;">model</td>
+    <td style="text-align: center;">GSM8K</td>
+    <td style="text-align: center;">MATH</td>
+    <td style="text-align: center;">AMC 2023</td>
+    <td style="text-align: center;">AIME 2024</td>
+    <td style="text-align: center;">Omni-MATH</td>
+  </tr>
+  <tr>
+    <td style="text-align: right;">Llama3.1-8B-Instruct</td>
+    <td style="text-align: center;">84.5</td>
+    <td style="text-align: center;">51.9</td>
+    <td style="text-align: center;">9/40</td>
+    <td style="text-align: center;">2/30</td>
+    <td style="text-align: center;">12.7</td>
+  </tr>
+  <tr>
+    <td style="text-align: right;">OpenMath2-Llama3.1-8B (<a href="https://huggingface.co/nvidia/OpenMath2-Llama3.1-8B-nemo">nemo</a> | <a href="https://huggingface.co/nvidia/OpenMath2-Llama3.1-8B">HF</a>)</td>
+    <td style="text-align: center;">91.7</td>
+    <td style="text-align: center;">67.8</td>
+    <td style="text-align: center;">16/40</td>
+    <td style="text-align: center;">3/30</td>
+    <td style="text-align: center;">22.0</td>
+  </tr>
+  <tr>
+    <td style="text-align: right;">+ majority@256</td>
+    <td style="text-align: center;">94.1</td>
+    <td style="text-align: center;">76.1</td>
+    <td style="text-align: center;">23/40</td>
+    <td style="text-align: center;">3/30</td>
+    <td style="text-align: center;">24.6</td>
+  </tr>
+  <tr>
+    <td style="text-align: right;">Llama3.1-70B-Instruct</td>
+    <td style="text-align: center;">95.1</td>
+    <td style="text-align: center;">68.0</td>
+    <td style="text-align: center;">19/40</td>
+    <td style="text-align: center;">6/30</td>
+    <td style="text-align: center;">19.0</td>
+  </tr>
+  <tr>
+    <td style="text-align: right;">OpenMath2-Llama3.1-70B (<a href="https://huggingface.co/nvidia/OpenMath2-Llama3.1-70B-nemo">nemo</a> | <a href="https://huggingface.co/nvidia/OpenMath2-Llama3.1-70B">HF</a>)</td>
+    <td style="text-align: center;">94.9</td>
+    <td style="text-align: center;">71.9</td>
+    <td style="text-align: center;">20/40</td>
+    <td style="text-align: center;">4/30</td>
+    <td style="text-align: center;">23.1</td>
+  </tr>
+  <tr>
+    <td style="text-align: right;">+ majority@256</td>
+    <td style="text-align: center;">96.0</td>
+    <td style="text-align: center;">79.6</td>
+    <td style="text-align: center;">24/40</td>
+    <td style="text-align: center;">6/30</td>
+    <td style="text-align: center;">27.6</td>
+  </tr>
+</table>
+
+## Paper
+
+[OpenMathInstruct-2: Accelerating AI for Math with Massive Open-Source Instruction Data](https://arxiv.org/abs/2410.01560)
+
+If you find our work useful, please consider citing us!
+
+```bibtex
+@inproceedings{toshniwal2024openmathinstruct2,
+  title   = {{OpenMathInstruct-2: Accelerating AI for Math with Massive Open-Source Instruction Data}},
+  author  = {Shubham Toshniwal and Wei Du and Ivan Moshkov and Branislav Kisacanin and Alexan Ayrapetyan and Igor Gitman},
+  year    = {2025},
+  booktitle = {ICLR},
+}
+```
+
+## How to reproduce our results
+
+Browse the sections below to see all commands needed to fully reproduce our results.
+
+Please note that unless you have access to a large GPU cluster, it might take a very long time
+for some of the commands to complete!
+
+- [Model evaluation](evaluation.md)
+- [Dataset construction](dataset.md)
+- [Model training](training.md)
--- a/docs/releases/openmathinstruct2/training.md
+++ b/docs/releases/openmathinstruct2/training.md
+# Model training
+
+We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
+executing all commands from that folder locally. Change all commands accordingly
+if running on slurm or using different paths.
+
+## Download data
+
+Get the data from [HuggingFace](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2).
+This might take 20-30 minutes (or more depending on your network connection) and will use ~20GB of RAM.
+
+```python
+import json
+
+from datasets import load_dataset
+from tqdm import tqdm
+
+dataset = load_dataset('nvidia/OpenMathInstruct-2', split='train')
+
+print("Converting dataset to jsonl format")
+output_file = "openmathinstruct2.jsonl"
+with open(output_file, 'w', encoding='utf-8') as f:
+    for item in tqdm(dataset):
+        f.write(json.dumps(item, ensure_ascii=False) + '\n')
+
+print(f"Conversion complete. Output saved as {output_file}")
+```
+
+You can also download a subset of the data by using e.g. `split='train_5M'` that we used to train the 70B model.
+See the dataset page for more details about this.
+
+## Convert to SFT format
+
+Convert the data into the SFT format that NeMo-Aligner understands.
+
+```bash
+ns run_cmd --cluster=local \
+python -m nemo_skills.training.prepare_data \
+    ++prompt_template=llama3-instruct \
+    ++prompt_config=generic/math \
+    ++preprocessed_dataset_files=/workspace/openmathinstruct2.jsonl \
+    ++output_key=generated_solution \
+    ++output_path=/workspace/openmathinstruct2-sft.jsonl \
+    ++hf_model_name="meta-llama/Meta-Llama-3.1-8B" \
+    ++filters.drop_multi_boxed=false \
+    ++filters.trim_prefix=false \
+    ++filters.trim_solutions=false \
+    ++filters.drop_incorrect_arithmetic=false \
+    ++filters.split_arithmetic=false \
+    ++filters.remove_contaminated=false
+```
+
+## Prepare base model
+
+Download the base model and convert it to NeMo format.
+The instructions below are for Llama3.1-8B, but the same commands should work for 70B model as well.
+
+```bash
+pip install -U "huggingface_hub[cli]"
+huggingface-cli download meta-llama/Llama-3.1-8B --local-dir Llama-3.1-8B
+
+ns convert \
+    --cluster=local \
+    --input_model=/workspace/Llama-3.1-8B \
+    --output_model=/workspace/llama3.1-8b-nemo \
+    --convert_from=hf \
+    --convert_to=nemo \
+    --model_type=llama \
+    --num_gpus=1 \
+    --hf_model_name=meta-llama/Llama-3.1-8B
+```
+
+## Run training
+
+Run the training (assuming slurm configuration here with the same folder structure). If your cluster has strict
+timeout policy, you can run multiple dependent jobs with `--num_training_jobs=N`.
+
+```bash
+ns train \
+    --cluster=slurm \
+    --expname=openmathinstruct2-repro-8b \
+    --output_dir=/workspace/openmathinstruct2-repro/checkpoints \
+    --nemo_model=/workspace/llama3.1-8b-nemo \
+    --num_nodes=8 \
+    --num_gpus=8 \
+    --average_steps=10000,20000,30000,40000,50000,60000 \
+    --training_data=/workspace/openmathinstruct2-sft.jsonl \
+    ++model.data.train_ds.micro_batch_size=8 \
+    ++model.tensor_model_parallel_size=4 \
+    ++model.pipeline_model_parallel_size=1 \
+    ++model.optim.lr=2e-5 \
+    ++trainer.sft.save_interval=10000 \
+    ++trainer.sft.max_steps=60000 \
+    ++trainer.sft.max_epochs=100
+```
+
+For 70B model, we used 5M data subset and the following parameters, but training
+it longer is likely going to improve results.
+
+```bash
+ns train \
+    --cluster=slurm \
+    --expname=openmathinstruct2-repro-70b \
+    --output_dir=/workspace/openmathinstruct2-repro-70b/checkpoints \
+    --nemo_model=/workspace/llama3.1-70b-nemo \
+    --num_nodes=32 \
+    --num_gpus=8 \
+    --average_steps=3330,6660,9990,13320,16650,20000 \
+    --training_data=/workspace/openmathinstruct2-sft-5M.jsonl \
+    ++model.data.train_ds.micro_batch_size=1 \
+    ++model.tensor_model_parallel_size=8 \
+    ++model.pipeline_model_parallel_size=2 \
+    ++model.optim.lr=1e-5 \
+    ++trainer.sft.save_interval=3330 \
+    ++trainer.sft.max_steps=20000 \
+    ++trainer.sft.max_epochs=100
+```
+
+If you have a job timeout, it's necessary to set the maximum time per run to 40 minutes
+before the timeout to allow for the final checkpoint to be saved. E.g. if your timeout is 4 hours,
+add `++exp_manager.max_time_per_run=00:03:20:00`
+
+
+If you want to follow up with checkpoint conversion and evaluation, see
+[training docs](../../pipelines/training.md#chaining-pipelines-with-python) for an example of how to do it
+through a convenient Python API.
--- a/docs/releases/openmathreasoning/dataset.md
+++ b/docs/releases/openmathreasoning/dataset.md
--- a/docs/releases/openmathreasoning/evaluation.md
+++ b/docs/releases/openmathreasoning/evaluation.md
+# Model evaluation
+
+Here are the commands you can run to reproduce our evaluation numbers.
+The commands below are for [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B) model as an example.
+We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
+executing all commands from that folder locally. Change all commands accordingly
+if running on slurm or using different paths.
+
+!!! tip "Interactive Chat Interface"
+
+    Besides the benchmark numbers shown below, you can also interactively chat with OpenMath models using our
+    [chat interface](../../basics/chat_interface.md). This allows you to easily test both Chain-of-Thought (CoT) and
+    Tool-Integrated Reasoning (TIR) modes with code execution in a user-friendly web UI.
+
+!!! note
+
+    For small benchmarks such as AIME24 and AIME25 (30 problems each) it is expected to see significant variation
+    across different evaluation reruns. We've seen the difference as large as 6% even for results that are averaged
+    across 64 generations. So please don't expect to see exactly the same numbers as presented in our paper, but
+    they should be within 3-6% of reported results.
+
+## Download models
+
+Get the model from HF. E.g.
+
+```bash
+pip install -U "huggingface_hub[cli]"
+huggingface-cli download nvidia/OpenMath-Nemotron-1.5B --local-dir OpenMath-Nemotron-1.5B
+```
+
+## Prepare evaluation data
+
+```bash
+ns prepare_data comp-math-24-25 hle
+```
+
+## Run CoT evaluations
+
+```bash
+ns eval \
+    --cluster=local \
+    --model=/workspace/OpenMath-Nemotron-1.5B \
+    --server_type=trtllm \
+    --output_dir=/workspace/openmath-nemotron-1.5b-eval-cot \
+    --benchmarks=comp-math-24-25:64 \
+    --server_gpus=1 \
+    --num_jobs=1 \
+    ++prompt_template=qwen-instruct \
+    ++prompt_config=generic/math \
+    ++inference.tokens_to_generate=32768 \
+    ++inference.temperature=0.6
+
+ns eval \
+    --cluster=local \
+    --model=/workspace/OpenMath-Nemotron-1.5B \
+    --server_type=trtllm \
+    --output_dir=/workspace/openmath-nemotron-1.5b-eval-cot \
+    --benchmarks=hle:64 \
+    --server_gpus=1 \
+    --num_jobs=1 \
+    --split=math \
+    ++prompt_template=qwen-instruct \
+    ++prompt_config=generic/math \
+    ++inference.tokens_to_generate=32768 \
+    ++inference.temperature=0.6
+```
+
+This will take a very long time unless you run on slurm cluster.
+If running on slurm, you can set `--num_jobs` to a bigger number or -1 to run
+each benchmark on a separate node. The number of GPUs needs to match what you used
+in the conversion command.
+
+For comp-math-24-25 our symbolic checker is good enough, so we can see the results right away by running
+
+```bash
+ns summarize_results /workspace/openmath-nemotron-1.5b-eval-cot/eval-results/comp-math-24-25 --metric_type math --cluster local
+```
+
+For hle-math it's necessary to run LLM-as-a-judge step to get accurate evaluation results.
+We used [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) which you
+can run with the following command, assuming you have the model downloaded and converted locally
+or on cluster.
+
+```bash
+ns generate \
+    --generation_type=math_judge \
+    --cluster=local \
+    --model=/hf_models/Qwen2.5-32B-Instruct \
+    --server_type=trtllm \
+    --server_gpus=4 \
+    --output_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle \
+    --input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results/hle
+```
+
+Alternatively, you can use an API model like gpt-4o, but the results might be different.
+You need to define `OPENAI_API_KEY` for the command below to work.
+
+```bash
+ns generate \
+    --generation_type=math_judge \
+    --cluster=local \
+    --model=gpt-4o \
+    --server_type=openai \
+    --server_address=https://api.openai.com/v1 \
+    --output_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle \
+    --input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results/hle
+```
+
+To print the metrics run
+
+```bash
+ns summarize_results /workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle --metric_type math --cluster local
+```
+
+This should print the metrics including both symbolic and judge evaluation.
+
+## Run TIR evaluations
+
+To get TIR evaluation numbers, replace the generation commands like this
+
+```bash
+ns eval \
+    --cluster=local \
+    --model=/workspace/OpenMath-Nemotron-1.5B \
+    --server_type=trtllm \
+    --output_dir=/workspace/openmath-nemotron-1.5b-eval-tir \
+    --benchmarks=comp-math-24-25:64 \
+    --server_gpus=1 \
+    --num_jobs=1 \
+    --with_sandbox \
+    ++code_tags=openmath \
+    ++prompt_template=qwen-instruct \
+    ++prompt_config=openmath/tir \
+    ++inference.tokens_to_generate=32768 \
+    ++inference.temperature=0.6 \
+    ++code_execution=true \
+    ++server.code_execution.add_remaining_code_executions=true \
+    ++total_code_executions_in_prompt=8
+```
+
+The only exception is [OpenMath-Nemotron-14B-Kaggle](https://huggingface.co/nvidia/OpenMath-Nemotron-14B-Kaggle),
+for which you should use the following options instead
+
+```bash
+ns eval \
+    --cluster=local \
+    --model=/workspace/openmath-nemotron-14b-kaggle-trtllm \
+    --server_type=trtllm \
+    --output_dir=/workspace/openmath-nemotron-14b-kaggle-eval-tir \
+    --benchmarks=comp-math-24-25:64 \
+    --server_gpus=1 \
+    --num_jobs=1 \
+    --with_sandbox \
+    ++code_tags=openmath \
+    ++prompt_template=qwen-instruct \
+    ++prompt_config=generic/math \
+    ++inference.tokens_to_generate=32768 \
+    ++inference.temperature=0.6 \
+    ++code_execution=true
+```
+
+All other commands are the same as in the [CoT part](#run-cot-evaluations).
+
+
+## Run GenSelect evaluations
+
+Here is a sample command to run GenSelect evaluation:
+
+```bash
+ns genselect \
+    --preprocess_args="++input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle" \
+    --model=/trt_models/openmath-nemotron-1.5b \
+    ++prompt_template=qwen-instruct \
+    --output_dir=/workspace/openmath-nemotron-1.5b-eval-cot/self_genselect_hle \
+    --cluster=local \
+    --server_type=trtllm \
+    --server_gpus=1 \
+    --num_random_seeds=64
+```
+
+The output folder will have three folders (apart from log folders):
+
+1. `comparison_instances`: This is the folder where input instances for genselect are kept.
+
+2. `comparison_judgment`: Output of GenSelect judgments.
+
+3. `hle` / `math`: Folder with outputs based on GenSelect's judgments. If `dataset` is not specified in the command, we create a folder with the name `math`.
+
+To print the metrics run:
+
+```bash
+ns summarize_results \
+  /workspace/openmath-nemotron-1.5b-eval-cot/self_genselect_hle/hle \
+  --metric_type math \
+  --cluster local
+```
--- a/docs/releases/openmathreasoning/index.md
+++ b/docs/releases/openmathreasoning/index.md
+---
+date: 2025-04-23
+---
+
+# OpenMathReasoning
+
+## OpenMathReasoning Dataset
+
+Using our pipelines we created [OpenMathReasoning dataset](https://huggingface.co/datasets/nvidia/OpenMathReasoning).
+This dataset contains
+
+* 306K unique mathematical problems sourced from [AoPS forums](https://artofproblemsolving.com/community) with:
+    * 3.2M long chain-of-thought (CoT) solutions
+    * 1.7M long tool-integrated reasoning (TIR) solutions
+    * 566K samples that select the most promising solution out of many candidates (GenSelect)
+* Additional 193K problems sourced from AoPS forums (problems only, no solutions)
+
+We used [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) to preprocess problems, and
+[DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) and [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) to generate solutions.
+
+This dataset was a foundation of our winning submission to the
+[AIMO-2 Kaggle competition](https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-2/leaderboard).
+
+See our [paper](https://arxiv.org/abs/2504.16891) to learn more details!
+
+## OpenMath-Nemotron Models
+
+To demonstrate the quality of this dataset, we release a series of OpenMath-Nemotron models trained on this data.
+
+* [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B)
+* [OpenMath-Nemotron-7B](https://huggingface.co/nvidia/OpenMath-Nemotron-7B)
+* [OpenMath-Nemotron-14B](https://huggingface.co/nvidia/OpenMath-Nemotron-14B)
+* [OpenMath-Nemotron-14B-Kaggle](https://huggingface.co/nvidia/OpenMath-Nemotron-14B-Kaggle) (this is the model used in [AIMO-2 Kaggle competition](https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-2/leaderboard))
+* [OpenMath-Nemotron-32B](https://huggingface.co/nvidia/OpenMath-Nemotron-32B)
+
+![Evaluation Results](./openmath-results.png)
+
+The models achieve strong results on popular mathematical benchmarks. We present metrics as pass@1 (maj@64) where pass@1
+is an average accuracy across 64 generations and maj@64 is the result of majority voting.
+Please see our [paper](https://arxiv.org/abs/2504.16891) for more details on the evaluation setup.
+
+| Model                                                                                            | AIME24      | AIME25      | HMMT-24-25  | HLE-Math    |
+| ------------------------------------------------------------------------------------------------ | ----------- | ----------- | ----------- | ----------- |
+| DeepSeek-R1-Distill-Qwen-1.5B                                                                    | 26.8 (60.0) | 21.4 (36.7) | 14.2 (26.5) | 2.9 (5.0)   |
+| [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B) CoT               | 61.6 (80.0) | 49.5 (66.7) | 39.9 (53.6) | 5.4 (5.4)   |
+| [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B) TIR               | 52.0 (83.3) | 39.7 (70.0) | 37.2 (60.7) | 2.5 (6.2)   |
+| + Self GenSelect                                                                                 | 83.3        | 70.0        | 62.2        | 7.9         |
+| + 32B GenSelect                                                                                  | 83.3        | 70.0        | 62.8        | 8.3         |
+| DeepSeek-R1-Distill-Qwen-7B                                                                      | 54.4 (80.0) | 38.6 (53.3) | 30.6 (42.9) | 3.3 (5.2)   |
+| [OpenMath-Nemotron-7B](https://huggingface.co/nvidia/OpenMath-Nemotron-7B) CoT                   | 74.8 (80.0) | 61.2 (76.7) | 49.7 (57.7) | 6.6 (6.6)   |
+| [OpenMath-Nemotron-7B](https://huggingface.co/nvidia/OpenMath-Nemotron-7B) TIR                   | 72.9 (83.3) | 57.5 (76.7) | 54.6 (66.3) | 7.8 (10.8)  |
+| + Self GenSelect                                                                                 | 86.7        | 76.7        | 68.4        | 11.5        |
+| + 32B GenSelect                                                                                  | 86.7        | 76.7        | 69.9        | 11.9        |
+| DeepSeek-R1-Distill-Qwen-14B                                                                     | 65.8 (80.0) | 48.4 (60.0) | 40.1 (52.0) | 4.2 (4.8)   |
+| [OpenMath-Nemotron-14B-MIX (kaggle)](https://huggingface.co/nvidia/OpenMath-Nemotron-14B-Kaggle) | 73.7 (86.7) | 57.9 (73.3) | 50.5 (64.8) | 5.7 (6.5)   |
+| [OpenMath-Nemotron-14B](https://huggingface.co/nvidia/OpenMath-Nemotron-14B) CoT                 | 76.3 (83.3) | 63.0 (76.7) | 52.1 (60.7) | 7.5 (7.6)   |
+| [OpenMath-Nemotron-14B](https://huggingface.co/nvidia/OpenMath-Nemotron-14B) TIR                 | 76.3 (86.7) | 61.3 (76.7) | 58.6 (70.9) | 9.5 (11.5)  |
+| + Self GenSelect                                                                                 | 86.7        | 76.7        | 72.4        | 14.1        |
+| + 32B GenSelect                                                                                  | 90.0        | 76.7        | 71.9        | 13.7        |
+| QwQ-32B                                                                                          | 78.1 (86.7) | 66.5 (76.7) | 55.9 (63.3) | 9.0 (9.5)   |
+| DeepSeek-R1-Distill-Qwen-32B                                                                     | 66.9 (83.3) | 51.8 (73.3) | 39.9 (51.0) | 4.8 (6.0)   |
+| [OpenMath-Nemotron-32B](https://huggingface.co/nvidia/OpenMath-Nemotron-32B) CoT                 | 76.5 (86.7) | 62.5 (73.3) | 53.0 (59.2) | 8.3 (8.3)   |
+| [OpenMath-Nemotron-32B](https://huggingface.co/nvidia/OpenMath-Nemotron-32B) TIR                 | 78.4 (93.3) | 64.2 (76.7) | 59.7 (70.9) | 9.2 (12.5)  |
+| + Self GenSelect                                                                                 | 93.3        | 80.0        | 73.5        | 15.7        |
+| DeepSeek-R1                                                                                      | 79.1 (86.7) | 64.3 (73.3) | 53.0 (59.2) | 10.5 (11.4) |
+
+## Paper
+
+[AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning Models with OpenMathReasoning dataset](https://arxiv.org/abs/2504.16891)
+
+If you find our work useful, please consider citing us!
+
+```bibtex
+@article{moshkov2025aimo2,
+  title   = {{AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning Models with OpenMathReasoning dataset}},
+  author  = {Ivan Moshkov and Darragh Hanley and Ivan Sorokin and Shubham Toshniwal and Christof Henkel and Benedikt Schifferer and Wei Du and Igor Gitman},
+  year    = {2025},
+  journal = {arXiv preprint arXiv:2504.16891}
+}
+```
+
+## How to reproduce our results
+
+Browse the sections below to see all commands needed to fully reproduce our results.
+
+Please note that unless you have access to a large GPU cluster, it might take a very long time
+for some of the commands to complete!
+
+- [Model evaluation](evaluation.md)
+- [Dataset construction](dataset.md)
+- [Model training](training.md)
--- a/docs/releases/openmathreasoning/openmath-results.png
+++ b/docs/releases/openmathreasoning/openmath-results.png
--- a/docs/releases/openmathreasoning/training.md
+++ b/docs/releases/openmathreasoning/training.md
--- a/docs/releases/openreasoning/dataset.md
+++ b/docs/releases/openreasoning/dataset.md
+# Dataset construction
+
+!!! note
+
+    This page has instructions for how to re-generate datasets from scratch. If you just want to download existing
+    data that we released, you can use the scripts in the [training documentation](./training.md#download-data-and-convert-to-sft-format).
+
+Here are the commands you can run to re-create our synthetic dataset.
+We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
+running commands with a Slurm config. Change all commands accordingly if running locally or using different paths.
+
+## Math data
+
+### Solution generation
+
+We use problems from [OpenMathReasoning](https://huggingface.co/datasets/nvidia/OpenMathReasoning) dataset. So first,
+download them using this Python snippet and put inside `/workspace/open-reasoning/sdg` on your Slurm cluster.
+
+We found that the quality of converted proof problems is not high, so we are excluding them here.
+
+```python
+from datasets import concatenate_datasets, load_dataset
+
+def remove_proofs(example):
+    return example['problem_type'] != 'converted_proof'
+
+dataset = load_dataset("nvidia/OpenMathReasoning")
+
+dataset['cot'] = dataset['cot'].remove_columns(['generation_model', 'generated_solution', 'inference_mode', 'used_in_kaggle'])
+dataset['additional_problems'] = dataset['additional_problems'].remove_columns(['generation_model', 'generated_solution', 'inference_mode', 'used_in_kaggle'])
+full_data = concatenate_datasets([dataset['cot'], dataset['additional_problems']])
+full_data = full_data.filter(remove_proofs, num_proc=20)
+
+full_data.to_json("math-problems.jsonl")
+```
+
+Next, prepare the [DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528) to run on Slurm.
+Here we assume that model is hosted on 16 H100 GPUs, but other GPU configurations are possible with corresponding
+modifications to commands.
+
+To download the model you can run the following from `/workspace` folder on Slurm.
+We will also need [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) to use as the judge
+for answer correctness.
+
+```bash
+huggingface-cli download deepseek-ai/DeepSeek-R1-0528 --local-dir DeepSeek-R1-0528
+huggingface-cli download Qwen/Qwen2.5-32B-Instruct --local-dir Qwen2.5-32B-Instruct
+```
+
+The next step is optional, but we recommend sharding the checkpoint to avoid very long loading time.
+
+```python
+from nemo_skills.pipeline.cli import run_cmd, wrap_arguments
+
+cmd = (
+    "python3 nemo_skills/conversion/save_sharded_state.py "
+    "    --model-path=/workspace/DeepSeek-R1-0528 "
+    "    --output=/workspace/DeepSeek-R1-0528-tp16 "
+    "    --tensor-parallel-size=16 "
+    "    --context-len=8192 "
+    "    --trust-remote-code "
+    "    --nnodes 2 "
+    "    --dist-init-addr $SLURM_MASTER_NODE:20000 "
+    "    --node-rank $SLURM_PROCID "
+)
+
+run_cmd(
+    ctx=wrap_arguments(cmd),
+    cluster="slurm",
+    num_gpus=8,
+    num_nodes=2,
+    container="sglang",
+    log_dir="/workspace/DeepSeek-R1-0528-tp16",
+)
+```
+
+Finally, launch the data generation command. You can adjust `num_chunks` (how many jobs to launch in parallel) and
+`dependent_jobs` (how many jobs to launch sequentially in case there is a fixed timeout on cluster) to fit your setup.
+
+```python
+from nemo_skills.pipeline.cli import generate, run_cmd, wrap_arguments
+
+cluster = 'slurm'
+tokens_to_generate = 32768
+num_solutions = 16
+
+# Main generation - this will take a lot of time and GPUs!
+# You can select a subset of data to run on if you want to test things
+generate(
+    ctx=wrap_arguments(
+        f"++prompt_config=generic/math "
+        f"++inference.temperature=0.6 "
+        f"++inference.tokens_to_generate={tokens_to_generate} "
+    ),
+    cluster=cluster,
+    input_file="/workspace/open-reasoning/sdg/math-problems.jsonl",
+    output_dir="/workspace/open-reasoning/sdg/solutions",
+    expname="r1-0528-math-solutions",
+    model="/workspace/DeepSeek-R1-0528-tp16",
+    server_type="sglang",
+    server_gpus=8,
+    server_nodes=2,
+    server_args=f"--load-format sharded_state --context-length {tokens_to_generate + 2000}",
+    num_random_seeds=num_solutions,
+    # set these according to your cluster configuration
+    # num_chunks=N,
+    # dependent_jobs=M,
+)
+
+# Judge step, this one is very fast as it just compares the predicted
+# and expected answers for each solution, doesn't check reasoning
+generate(
+    ctx=wrap_arguments(""),
+    cluster=cluster,
+    generation_type="math_judge",
+    input_dir=f"/workspace/open-reasoning/sdg/solutions",
+    output_dir=f"/workspace/open-reasoning/sdg/solutions-judged",
+    expname="r1-0528-math-solutions-judge",
+    run_after="r1-0528-math-solutions",
+    model="/workspace/Qwen2.5-32B-Instruct",
+    server_type="sglang",
+    server_gpus=8,
+    num_random_seeds=num_solutions,
+)
+
+# We then change all "expected_answer" values to the majority
+# from R1 if there is not a single match. While there are some really
+# hard problems for which this will not be correct, we found that
+# in most cases when R1 is not able to match GT answer even one time,
+# the GT answer itself is not correct.
+run_cmd(
+    ctx=wrap_arguments(
+        "python /nemo_run/code/recipes/openreasoning/scripts/use_majority_if_no_answer.py "
+        "    /workspace/open-reasoning/sdg/solutions-judged "
+        "    /workspace/open-reasoning/sdg/maj-if-no-correct "
+    ),
+    cluster=cluster,
+    expname="change-to-majority-if-no-correct",
+    run_after="r1-0528-math-solutions-judge",
+    log_dir="/workspace/open-reasoning/sdg/maj-if-no-correct",
+)
+
+# Next we re-judge the data to keep matches with the new majority answer
+# (should cover non-string match cases like 0.5 vs 1/2)
+generate(
+    ctx=wrap_arguments(""),
+    cluster=cluster,
+    generation_type="math_judge",
+    input_dir=f"/workspace/open-reasoning/sdg/maj-if-no-correct",
+    output_dir=f"/workspace/open-reasoning/sdg/maj-if-no-correct-judged",
+    expname="r1-0528-math-solutions-judge-after-majority",
+    run_after="change-to-majority-if-no-correct",
+    model="/workspace/Qwen2.5-32B-Instruct",
+    server_type="sglang",
+    server_gpus=8,
+    num_random_seeds=num_solutions,
+)
+
+# As the final step we convert this data to the format that can be used for SFT.
+# This script will also filter anything not judged as correct
+cmd = (
+    "python -m nemo_skills.training.prepare_data "
+    "    ++prompt_template=qwen-instruct "
+    "    ++prompt_config=generic/math "
+    "    ++input_files='/workspace/open-reasoning/sdg/maj-if-no-correct-judged/output-rs*.jsonl' "
+    "    ++output_path=/workspace/open-reasoning/sft-data-math.jsonl "
+    "    ++filters.drop_multi_boxed=false "
+    "    ++filters.trim_prefix=false "
+    "    ++filters.remove_no_think_tags=true "
+    "    ++filters.remove_contaminated=false "  # OpenMathReasoning is already decontaminated
+    "    ++filters.remove_len_outlier_solutions=false "
+    "    ++filters.remove_len_outlier_problems=false "
+    "    ++use_judgement=true "
+)
+run_cmd(
+    ctx=wrap_arguments(cmd),
+    cluster=cluster,
+    log_dir="/workspace/open-reasoning/sft-data-math-logs",
+    expname='prepare-for-sft-math',
+    run_after="r1-0528-math-solutions-judge-after-majority",
+)
+```
+
+The final data that's ready for training will then be available in `/workspace/open-reasoning/sft-data-math.jsonl`.
+
+### GenSelect data
+
+Coming soon!
+
+## Code data
+
+The code data was created with exactly the same pipeline as used for [OpenCodeReasoning dataset](../opencodereasoning/dataset.md),
+except the solutions are generated with [DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528).
+
+## Science data
+
+We generate science problems using [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) and [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) LLMs with the [prompt for science question generation](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/openreasoning/prompts/science_question_generation_prompt.yaml), using few-shot examples to demonstrate the format.
+Questions are generated based on difficulty level, topic, and subtopic.
+Full dataset used for this effort is available at [HuggingFace](https://huggingface.co/datasets/nvidia/OpenScience).
+Note: HuggingFace version includes questions generated with [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct), which are not used for OpenReasoning.
+
+The next step is to augment these problems using the [prompt for science question augmentation](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/openreasoning/prompts/science_question_augmentation_prompt.yaml), with few-shot examples to demonstrate the format of the output.
+
+Next, we generate solutions for these problems.
+We use [DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528) to generate solutions with parameters as described in the math section above.
+
+The final step is to apply majority voting over the solutions generated in the previous step to obtain the final dataset.
+
+The resulting dataset, OpenScienceReasoning-2, is available for download on Hugging Face [here](https://huggingface.co/datasets/nvidia/OpenScienceReasoning-2).
\ No newline at end of file
--- a/docs/releases/openreasoning/evaluation.md
+++ b/docs/releases/openreasoning/evaluation.md
+# Model evaluation
+
+Here are the commands you can run to reproduce our evaluation numbers.
+We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
+executing all commands from that folder locally. Change all commands accordingly
+if running on slurm or using different paths.
+
+## Download models
+
+Get the models from HF. E.g.
+
+```bash
+huggingface-cli download nvidia/OpenReasoning-Nemotron-1.5B --local-dir OpenReasoning-Nemotron-1.5B
+```
+
+To evaluate HLE we used [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) model as a judge.
+You will need to download it as well if you want to reproduce HLE numbers.
+
+```bash
+huggingface-cli download Qwen/Qwen2.5-32B-Instruct --local-dir Qwen2.5-32B-Instruct
+```
+
+## Prepare evaluation data
+
+```bash
+ns prepare_data aai aime24 aime25 hmmt_feb25 brumo25 livecodebench gpqa mmlu-pro hle
+```
+
+## Run evaluation
+
+!!! note
+
+    The current script only supports GenSelect evaluation for math benchmarks.
+    We will add instructions and commands for GenSelect for code and science in the next few days.
+
+We provide an evaluation script in [recipes/openreasoning/eval.py](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/openreasoning/eval.py).
+It will run evaluation on all benchmarks and for all 4 model sizes. You can modify it directly to change evaluation settings
+or to only evaluate a subset of models / benchmarks.
+
+After the evaluation is finished, you can find `metrics.json` files in each benchmark folder with full scores.
+
+To view GenSelect scores additionally run the following commands for each benchmark and model size. E.g. for 14B and `hmmt_feb25` benchmark, run
+
+```bash
+ns summarize_results /workspace/open-reasoning-evals/14B-genselect/hmmt_feb25/math/ --metric_type math
+```
+
+which should print the following scores. Here `majority@64` is the number we are looking for.
+Note that this is majority across GenSelect runs, not original generations.
+
+```bash
+----------------------------------- math ----------------------------------
+evaluation_mode   | num_entries | avg_tokens | symbolic_correct | no_answer
+pass@1[avg-of-64] | 30          | 16066      | 85.78%           | 0.21%
+majority@64       | 30          | 16066      | 93.33%           | 0.00%
+pass@64           | 30          | 16066      | 96.67%           | 0.00%     
+```
\ No newline at end of file
--- a/docs/releases/openreasoning/genselect.png
+++ b/docs/releases/openreasoning/genselect.png
--- a/docs/releases/openreasoning/index.md
+++ b/docs/releases/openreasoning/index.md
+---
+date: 2025-07-18
+---
+
+# OpenReasoning
+
+We released OpenReasoning-Nemotrons: a suite of reasoning-capable large language models (LLMs) which have been distilled from the DeepSeek R1 0528 671B model. Trained on a massive, high-quality dataset distilled from the new DeepSeek R1 0528, our new 7B, 14B, and 32B models achieve state-of-the-art performance on a wide range of reasoning benchmarks for their respective sizes in the domain of mathematics, science and code.
+The models are available to download from **Hugging Face** ([1.5B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-1.5B), [7B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-7B), [14B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-14B), [32B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-32B)).
+
+The foundation of these models is their dataset. We generated **5 million high-quality reasoning-based solutions** by leveraging the powerful DeepSeek R1 0528 model across the domains of mathematics, coding, and science. This dataset will be released in the coming months, enabling all models to improve their reasoning capabilities on these domains.
+
+## Evaluation results
+
+![Evaluation Results with pass@1](./pass-1.png)
+
+Our models demonstrate exceptional performance across a suite of challenging reasoning benchmarks. The 7B, 14B, and 32B models consistently set new state-of-the-art records for their size classes.
+
+| **Model** | **ArtificialAnalysisIndex*** | **GPQA** | **MMLU-PRO** | **HLE** | **LiveCodeBench*** | **SciCode** | **AIME24** | **AIME25** | **HMMT FEB 25**  |
+| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
+| **1.5B**| 31.0 | 31.6 | 47.5 | 5.5 | 28.6 | 2.2 | 55.5 | 45.6 | 31.5 |
+| **7B** | 54.7 | 61.1 | 71.9 | 8.3 | 63.3 | 16.2 | 84.7 | 78.2 | 63.5 |
+| **14B** | 60.9 | 71.6 | 77.5 | 10.1 | 67.8 | 23.5 | 87.8 | 82.0 | 71.2 |
+| **32B** | 64.3 | 73.1 | 80.0 | 11.9 | 70.2 | 28.5 | 89.2 | 84.0 | 73.8 |
+
+\* This is our estimation of the Artificial Analysis Intelligence Index, not an official score.
+
+\* LiveCodeBench version 6, date range 2408-2505.
+
+
+## Combining the work of multiple agents
+OpenReasoning-Nemotron models can be used in a "heavy" mode by starting multiple parallel generations and combining them together via [generative solution selection (GenSelect)](https://arxiv.org/abs/2504.16891). To add this "skill" we follow the original GenSelect training pipeline except we do not train on the selection summary but use the full reasoning trace of DeepSeek R1 0528 671B instead. We only train models to select the best solution for math problems but surprisingly find that this capability directly generalizes to code and science questions! With this "heavy" GenSelect inference mode, OpenReasoning-Nemotron-32B model surpasses O3 (High) on math and coding benchmarks.
+
+![Evaluation Results with GenSelect](./genselect.png)
+
+| **Model** | **Pass@1 (Avg@64)** | **Majority@64** | **GenSelect** |
+| :--- | :--- | :--- | :--- |
+| **1.5B** | | | |
+| **AIME24** | 55.5 | 76.7 | 76.7 |
+| **AIME25** | 45.6 | 70.0 | 70.0 |
+| **HMMT Feb 25** | 31.5 | 46.7 | 53.3 |
+| **7B** | | | |
+| **AIME24** | 84.7 | 93.3 | 93.3 |
+| **AIME25** | 78.2 | 86.7 | 93.3 |
+| **HMMT Feb 25** | 63.5 | 83.3 | 90.0 |
+| **LCB v6 2408-2505** | 63.4 | n/a | 67.7 |
+| **14B** | | | |
+| **AIME24** | 87.8 | 93.3 | 93.3 |
+| **AIME25** | 82.0 | 90.0 | 90.0 |
+| **HMMT Feb 25** | 71.2 | 86.7 | 93.3 |
+| **LCB v6 2408-2505** | 67.9 | n/a | 69.1 |
+| **32B** | | | |
+| **AIME24** | 89.2 | 93.3 | 93.3 |
+| **AIME25** | 84.0 | 90.0 | 93.3 |
+| **HMMT Feb 25** | 73.8 | 86.7 | 96.7 |
+| **LCB v6 2408-2505** | 70.2 | n/a | 75.3 |
+| **HLE** | 11.8 | 13.4 | 15.5 |
+
+
+
+## How to reproduce our results
+
+Browse the sections below to see all commands needed to fully reproduce our results.
+
+Please note that unless you have access to a large GPU cluster, it might take a very long time
+for some of the commands to complete!
+
+- [Model evaluation](evaluation.md)
+- [Dataset construction](dataset.md)
+- [Model training](training.md)
--- a/docs/releases/openreasoning/pass-1.png
+++ b/docs/releases/openreasoning/pass-1.png
--- a/docs/releases/openreasoning/training.md
+++ b/docs/releases/openreasoning/training.md
+# Model training
+
+## Download data and convert to SFT format
+
+OpenReasoning dataset consists of 5 independent parts:
+
+* Math CoT data
+* Math TIR data
+* Math GenSelect data
+* Code CoT data
+* Science CoT data
+
+All datasets except GenSelect are now released. You can use code snippets below to download them and prepare for SFT.
+For final training dataset, you should concatenate all of the data together.
+
+### Math CoT data
+
+Math CoT data is released as part of the [nvidia/Nemotron-Post-Training-Dataset-v1](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1) dataset.
+
+```python
+from functools import partial
+from datasets import load_dataset
+from nemo_skills.prompt.utils import get_prompt
+
+def apply_format(elem, prompt):
+    assert len(elem['messages']) == 2
+    elem['input'] = prompt.fill({'problem': elem['messages'][0]['content']})
+    elem['output'] = elem['messages'][1]['content'] + prompt.config.template.assistant_end
+    return elem
+
+dataset = load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="math")
+
+prompt = get_prompt('generic/math', 'qwen-instruct')
+func = partial(apply_format, prompt=prompt)
+dataset = dataset.map(func, num_proc=20)
+dataset = dataset.remove_columns(['messages'])
+
+dataset.to_json("open-reasoning-math-cot.jsonl")
+```
+
+### Math TIR data
+
+We re-use math TIR data from [nvidia/OpenMathReasoning](https://huggingface.co/datasets/nvidia/OpenMathReasoning) dataset.
+While we included this data in training and our released models are capable of TIR inference, we found that results are
+generally worse than using CoT. To fix this, TIR data would need to be re-generated using newer models, but this is not
+done in our current release.
+
+To get this data, follow instructions for the **second-round** SFT data in [OpenMathReasoning documentation](../openmathreasoning/training.md#second-round-sft).
+
+### Math GenSelect data
+
+Coming soon!
+
+### Code CoT data
+
+Code CoT data is released as part of the [nvidia/Nemotron-Post-Training-Dataset-v1](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1) dataset.
+
+```python
+import json
+from functools import partial
+from datasets import load_dataset
+from nemo_skills.prompt.utils import get_prompt
+
+question_datasets = {
+    "taco": load_dataset("BAAI/TACO"),
+    "apps": load_dataset("codeparrot/apps"),
+    "code_contests": load_dataset("deepmind/code_contests"),
+    "open-r1/codeforces": load_dataset("open-r1/codeforces")
+}
+
+
+def get_question(ds_name, split, index):
+    benchmark = question_datasets[ds_name][split][int(index)]
+    if ds_name == "code_contests":
+        return benchmark["description"]
+    elif ds_name in ["taco", "apps"]:
+        return benchmark["question"]
+    elif ds_name == "open-r1/codeforces":
+        question = benchmark["description"]
+        if benchmark["input_format"]:
+            question += "\n\nInput\n\n" + benchmark["input_format"]
+        if benchmark["output_format"]:
+            question += "\n\nOutput\n\n" + benchmark["output_format"]
+        if benchmark["examples"]:
+            question += "\n\nExamples"
+            for example in benchmark["examples"]:
+                if "input" in example:
+                    question += "\n\nInput\n\n" + example["input"]
+                if "output" in example:
+                    question += "\n\nOutput\n\n" + example["output"]
+        if benchmark["note"]:
+            question += "\n\nNote\n\n" + benchmark["note"]
+        return question
+    else:
+        raise RuntimeError("Something wrong with the data!")
+
+
+def apply_format(elem, prompt):
+    metadata = json.loads(elem['metadata'])
+    question = get_question(metadata['dataset'], metadata['split'], int(metadata['index']))
+
+    elem['input'] = prompt.fill({'question': question})
+    elem['output'] = elem['messages'][1]['content'] + prompt.config.template.assistant_end
+    return elem
+
+dataset = load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="code")
+
+prompt = get_prompt('eval/livecodebench/python_codegen_reasoning', 'qwen-instruct')
+func = partial(apply_format, prompt=prompt)
+dataset = dataset.map(func, num_proc=20)
+dataset = dataset.remove_columns(['messages'])
+
+dataset.to_json("open-reasoning-code-cot.jsonl")
+```
+
+### Science CoT data
+
+Science CoT data is released as [nvidia/OpenScienceReasoning-2](https://huggingface.co/datasets/nvidia/OpenScienceReasoning-2) dataset.
+
+```python
+from functools import partial
+from datasets import load_dataset
+from nemo_skills.prompt.utils import get_prompt
+
+def apply_format(elem, prompt):
+    elem['input'] = prompt.fill({'question': elem['input']})
+    elem['output'] += prompt.config.template.assistant_end
+    return elem
+
+dataset = load_dataset("nvidia/OpenScienceReasoning-2", split="train")
+
+prompt = get_prompt('generic/default', 'qwen-instruct')  # data already includes instruction
+func = partial(apply_format, prompt=prompt)
+dataset = dataset.map(func, num_proc=20)
+
+dataset.to_json("open-reasoning-science-cot.jsonl")
+```
+
+
+## Train the models
+
+We mostly use the same training commands as for [OpenMathReasoning models](../openmathreasoning/training.md#run-training). The only difference
+is that we pack sequences to 49152 length and use slightly different hyperparameters detailed in the following table.
+
+|                       | **lr** | **min_lr** | **TP** | **PP** | **CP** |
+| --------------------- | ------ | ---------- | ------ | ------ | ------ |
+| **Qwen2.5-Math-1.5B** | 1e-4   | 1e-7       | 1      | 1      | 4      |
+| **Qwen2.5-Math-7B**   | 1e-4   | 1e-7       | 4      | 1      | 4      |
+| **Qwen2.5-14B**       | 1e-4   | 1e-7       | 8      | 1      | 4      |
+| **Qwen2.5-32B**       | 1e-4   | 1e-7       | 8      | 2      | 4      |
+
+All models are trained for 30000 steps with a single round of SFT and we take the last checkpoint as the final model.
\ No newline at end of file
--- a/docs/tutorials/images/omr-simple-recipe/figure1.png
+++ b/docs/tutorials/images/omr-simple-recipe/figure1.png
--- a/docs/tutorials/images/omr-simple-recipe/figure2.png
+++ b/docs/tutorials/images/omr-simple-recipe/figure2.png
--- a/docs/tutorials/images/omr-simple-recipe/figure3.png
+++ b/docs/tutorials/images/omr-simple-recipe/figure3.png
--- a/docs/tutorials/index.md
+++ b/docs/tutorials/index.md
+---
+title: Tutorials
+hide:
+  - toc
+---
--- a/docs/tutorials/posts/omr-simple-recipe.md
+++ b/docs/tutorials/posts/omr-simple-recipe.md
--- a/mkdocs.yml
+++ b/mkdocs.yml
+# Site metadata
+site_name: NeMo-Skills
+site_url: https://nvidia.github.io/NeMo-Skills
+extra_css:
+  - css/extra.css
+plugins:
+  - blog:
+      blog_dir: tutorials
+      post_dir: tutorials/posts
+  - redirects:
+      # old page path -> current page path
+      redirect_maps:
+        'openmathinstruct2/index.md': 'releases/openmathinstruct2/index.md'
+        'openmathreasoning1/index.md': 'releases/openmathreasoning/index.md'
+theme:
+  name: material
+  logo: favicon.ico
+  favicon: favicon.ico
+  palette:
+    primary: blue grey
+  features:
+    - content.code.copy
+    - content.code.annotate
+    - navigation.instant
+    - navigation.instant.progress
+    - navigation.tabs
+    - navigation.tabs.sticky
+    - navigation.indexes
+    - toc.follow
+# Each extension is listed exactly once
+markdown_extensions:
+  - meta
+  - pymdownx.highlight:
+      anchor_linenums: true
+      line_spans: __span
+      pygments_lang_class: true
+  - pymdownx.inlinehilite
+  - pymdownx.snippets
+  - pymdownx.superfences
+  - pymdownx.tabbed:
+      alternate_style: true
+      slugify: !!python/object/apply:pymdownx.slugs.slugify
+        kwds:
+          case: lower
+  - admonition
+  - pymdownx.details
+  - toc:
+      permalink: true
+  - attr_list
+  - pymdownx.emoji:
+      emoji_index: !!python/name:material.extensions.emoji.twemoji
+      emoji_generator: !!python/name:material.extensions.emoji.to_svg
+  - footnotes
+# Navigation tree; an entry without a title is the index page of its section
+nav:
+  - NeMo-Skills: index.md
+  - Getting started:
+    - basics/index.md
+    - Cluster configs: basics/cluster-configs.md
+    - Code packaging: basics/code-packaging.md
+    - Prompt format: basics/prompt-format.md
+    - Inference: basics/inference.md
+    - Chat Interface: basics/chat_interface.md
+    - Sandbox for code execution: basics/sandbox.md
+  - Pipelines:
+    - pipelines/index.md
+    - Generation: pipelines/generation.md
+    - Evaluation: pipelines/evaluation.md
+    - Checkpoint conversion: pipelines/checkpoint-conversion.md
+    - LLM-as-a-judge: pipelines/llm-as-a-judge.md
+    - Decontamination: pipelines/decontamination.md
+    - Training (NeMo-Aligner): pipelines/training.md
+    - Training (verl, OpenRLHF): pipelines/training-verl-openrlhf.md
+    - Arbitrary commands: pipelines/run-cmd.md
+  - Tutorials:
+    - tutorials/index.md
+  - Papers & Releases:
+    - releases/index.md
+    - OpenReasoning:
+      - releases/openreasoning/index.md
+      - Model Evaluation: releases/openreasoning/evaluation.md
+      - Dataset construction: releases/openreasoning/dataset.md
+      - Model training: releases/openreasoning/training.md
+    - OpenCodeReasoning:
+      - releases/opencodereasoning/index.md
+      - Model Evaluation: releases/opencodereasoning/evaluation.md
+      - Dataset construction: releases/opencodereasoning/dataset.md
+    - OpenMathReasoning:
+      - releases/openmathreasoning/index.md
+      - Model Evaluation: releases/openmathreasoning/evaluation.md
+      - Dataset construction: releases/openmathreasoning/dataset.md
+      - Model training: releases/openmathreasoning/training.md
+    - OpenMathInstruct-2:
+      - releases/openmathinstruct2/index.md
+      - Model Evaluation: releases/openmathinstruct2/evaluation.md
+      - Dataset construction: releases/openmathinstruct2/dataset.md
+      - Model training: releases/openmathinstruct2/training.md
\ No newline at end of file
--- a/nemo_skills/__init__.py
+++ b/nemo_skills/__init__.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_skills.version import __version__
+
+# only used in ns setup command to initialize with defaults
+# Keys are the container names; values are the default image references,
+# all of them currently pinned to the same 0.6.1 tag.
+_containers = {
+    'trtllm': 'igitman/nemo-skills-trtllm:0.6.1',
+    'vllm': 'igitman/nemo-skills-vllm:0.6.1',
+    'sglang': 'igitman/nemo-skills-sglang:0.6.1',
+    'nemo': 'igitman/nemo-skills-nemo:0.6.1',
+    'megatron': 'igitman/nemo-skills-megatron:0.6.1',
+    'sandbox': 'igitman/nemo-skills-sandbox:0.6.1',
+    'nemo-skills': 'igitman/nemo-skills:0.6.1',
+    'verl': 'igitman/nemo-skills-verl:0.6.1',
+    'nemo-rl': 'igitman/nemo-skills-nemo-rl:0.6.1',
+}
--- a/nemo_skills/code_execution/__init__.py
+++ b/nemo_skills/code_execution/__init__.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from nemo_skills.code_execution.utils import extract_code_output, extract_code_to_execute, format_code_output
--- a/nemo_skills/code_execution/local_sandbox/__init__.py
+++ b/nemo_skills/code_execution/local_sandbox/__init__.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/nemo_skills/code_execution/local_sandbox/local_sandbox_server.py
+++ b/nemo_skills/code_execution/local_sandbox/local_sandbox_server.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import logging
+import multiprocessing
+import os
+import resource
+import subprocess
+import sys
+import tempfile
+import signal
+from io import StringIO
+
+from flask import Flask, request
+
+app = Flask(__name__)
+
+MEM_LIMIT_BYTES = int(os.environ.get('NEMO_SKILLS_SANDBOX_MEM_LIMIT', 10 * 1024 ** 3))  # 10 GiB default
+
+def set_limits(mem_bytes: int = MEM_LIMIT_BYTES) -> None:
+    """
+    Apply RLIMITs and start a new session for the child process.
+
+    Called via `preexec_fn` (subprocess) or directly in a forked worker.
+    """
+    resource.setrlimit(resource.RLIMIT_AS,   (mem_bytes, mem_bytes))
+    resource.setrlimit(resource.RLIMIT_DATA, (mem_bytes, mem_bytes))
+    os.setsid()                              # isolate PGID / signals
+
+def execute_ipython(generated_code, timeout):
+    # running in a separate process to ensure any kind of crashes are properly handled
+    queue = multiprocessing.Queue()
+    process = multiprocessing.Process(target=execute_code_subprocess, args=(generated_code, queue))
+    process.start()
+    process.join(timeout=timeout)
+
+    if process.is_alive():  # didn't finish successfully
+        process.kill()
+        return {"process_status": "timeout", "stdout": "", "stderr": "Timed out\n"}
+
+    return queue.get()
+
+def execute_python(generated_code, std_input, timeout, language):
+
+    execution_command = [language, "-c", generated_code]
+    try:
+        process = subprocess.Popen(
+            execution_command,
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            text=True,
+            universal_newlines=True,
+            preexec_fn=set_limits,
+        )
+        stdout, stderr = process.communicate(input=std_input, timeout=timeout)
+        return {"process_status": "completed", "stdout": stdout, "stderr": stderr}
+    except subprocess.TimeoutExpired:
+        try:
+            # kill the whole process group
+            os.killpg(process.pid, signal.SIGKILL)
+        except ProcessLookupError:
+            pass
+        process.wait(timeout=1)  # reap, no extra timeout needed
+        return {"process_status": "timeout", "stdout": "", "stderr": "Timed out\n"}
+
+
+def execute_lean4(generated_code, timeout):
+    temp_file_name = None
+    try:
+        project_path = "/lean4/my_project"
+        with tempfile.NamedTemporaryFile(dir=project_path, delete=False, suffix=".lean") as temp_file:
+            temp_file_name = temp_file.name
+            temp_file.write(generated_code.encode('utf-8'))
+
+        result = subprocess.run(
+            ['lake', 'env', '--dir', project_path, 'lean', temp_file_name],
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            timeout=timeout,
+            cwd=project_path,  # Ensure we are in the correct working directory
+        )
+
+        if result.returncode == 0:
+            process_status = "completed"
+        else:
+            process_status = "failed"
+
+        return {
+            "process_status": process_status,
+            "stdout": result.stdout.decode('utf-8'),
+            "stderr": result.stderr.decode('utf-8'),
+        }
+
+    except subprocess.TimeoutExpired:
+        return {"process_status": "timeout", "stdout": "", "stderr": "Timed out\n"}
+    except Exception as e:
+        print(f"Error: {str(e)}")
+        return {"process_status": "error", "stdout": "", "stderr": str(e) + "\n"}
+    finally:
+        # Safely remove the temporary file if it was created
+        if temp_file_name and os.path.exists(temp_file_name):
+            os.remove(temp_file_name)
+
+
+# need to memory-limit to avoid common errors of allocating too much
+# but this has to be done in a subprocess to not crash the server itself
+def execute_code_subprocess(generated_code, queue):
+    # Worker body: exec() the code with stdout captured and report the outcome
+    # through `queue`.
+    #   success -> the captured stdout text is put on the queue as a plain string
+    #   failure -> a dict with process_status "error" and the exception text as stderr
+    # NOTE(review): presumably the caller wraps the code so that stdout already
+    # holds a serialized result - confirm against the client before changing this.
+
+    # this can be overridden inside generated code, so it's not a guaranteed protection
+    set_limits()
+    # from here on, everything printed in this process goes to an in-memory buffer
+    sys.stdout = StringIO()
+    try:
+        # fresh globals dict: the code does not see this module's names
+        exec(generated_code, {})
+        queue.put(sys.stdout.getvalue())
+    except Exception as e:
+        # only Exception subclasses are caught; SystemExit / KeyboardInterrupt
+        # propagate and nothing is put on the queue in that case
+        # this print writes into the StringIO buffer above, which is not read afterwards
+        print(f"Error: {str(e)}")
+        queue.put({"process_status": "error", "stdout": "", "stderr": str(e) + "\n"})
+
+
+# Main Flask endpoint to handle execution requests
+@app.route("/execute", methods=["POST"])
+def execute():
+    # Reads the JSON request body and dispatches on its 'language' field.
+    #   generated_code (required) - source text to run
+    #   timeout        (required) - forwarded unchanged to the selected executor
+    #   language       (optional) - defaults to 'ipython'
+    #   std_input      (optional) - defaults to ''; only passed to execute_python
+    # The executor's return value is returned as the response.
+    generated_code = request.json['generated_code']
+    timeout = request.json['timeout']
+    language = request.json.get('language', 'ipython')
+    std_input = request.json.get('std_input', '')
+
+    if language == 'ipython':
+        return execute_ipython(generated_code, timeout)
+    elif language == 'lean4':
+        return execute_lean4(generated_code, timeout)
+    else:
+        # NOTE(review): any other value is used by execute_python as the name of
+        # the executable to launch - confirm that callers are trusted.
+        return execute_python(generated_code, std_input, timeout, language)
+
+
+if __name__ == '__main__':
+    # Direct-run entry point: raise the werkzeug logger to WARNING, then start
+    # Flask's built-in server on port 6000.
+    log = logging.getLogger('werkzeug')
+    log.setLevel(logging.WARNING)
+    app.run(port=6000)
--- a/nemo_skills/code_execution/local_sandbox/start_local_sandbox.sh
+++ b/nemo_skills/code_execution/local_sandbox/start_local_sandbox.sh
+#!/bin/bash
+
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE: needs to run from the root of the repo!
+
+# Image tag to build and run: first positional argument, default 'local-sandbox'.
+SANDBOX_NAME=${1:-'local-sandbox'}
+
+# nproc has to run through command substitution: inside $(( )) a bare
+# `nproc --all` is evaluated as an arithmetic expression, not executed.
+NUM_CPUS=$(nproc --all)
+
+docker build --tag="${SANDBOX_NAME}" --build-arg="UWSGI_PROCESSES=$((NUM_CPUS * 10))" --build-arg="UWSGI_CHEAPER=${NUM_CPUS}" -f dockerfiles/Dockerfile.sandbox .
+
+# `docker run` rejects --rm together with a --restart policy, so only --rm is
+# kept: the container runs in the foreground and is removed when it exits.
+docker run --network=host --rm --memory="${NEMO_SKILLS_SANDBOX_MEM_LIMIT:-16g}" --name=local-sandbox "${SANDBOX_NAME}"
--- a/nemo_skills/code_execution/sandbox.py
+++ b/nemo_skills/code_execution/sandbox.py
--- a/nemo_skills/code_execution/utils.py
+++ b/nemo_skills/code_execution/utils.py
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import re
+from typing import Tuple
+
+from nemo_skills.utils import get_logger_name
+
+# Module-level logger; its name is whatever get_logger_name returns for this file's path.
+LOG = logging.getLogger(get_logger_name(__file__))
+
+
+def format_code_output(
+    execution_dict,
+    code_output_begin: str,
+    code_output_end: str,
+    code_output_format: str = 'llama',
+    remaining_code_executions: int | None = None,
+):
+    """Formatting code output to be displayed as an llm expects it."""
+    remaining_ce_string = ""
+    if remaining_code_executions is not None:
+        if remaining_code_executions > 0:
+            remaining_ce_string = (
+                f"```system\n"
+                f"Remaining code executions: {remaining_code_executions}. "
+                f"You will not be able to call code when you run out of executions, so use it wisely. "
+                f"Note that you can still continue solving the problem without code after that.\n"
+                f"```\n"
+            )
+        else:
+            remaining_ce_string = (
+                f"```system\n"
+                f"You have run out of code executions! You can no longer write or execute code. "
+                f"Now you should continue solving the problem by relying on your mathematical reasoning and analytical skills.\n"
+                f"```\n"
+            )
+    if code_output_format == 'llama':
+        output = execution_dict["process_status"]
+        if execution_dict['stdout']:
+            output += f"\n[stdout]\n{execution_dict['stdout']}[/stdout]"
+        if execution_dict['stderr']:
+            output += f"\n[stderr]\n{execution_dict['stderr']}[/stderr]"
+        output = f"{code_output_begin}\n\n{output}{remaining_ce_string}{code_output_end}\n\n"
+    elif code_output_format == 'qwen':
+        output = ""
+        if execution_dict['stdout']:
+            output += f"{execution_dict['stdout']}"
+        if execution_dict['stderr']:
+            output += f"{execution_dict['stderr']}"
+        if execution_dict['stderr'] and execution_dict['stdout']:
+            LOG.warning("Both stdout and stderr are not empty. This shouldn't normally happen! %s", execution_dict)
+        output = f"{code_output_begin}{output}{code_output_end}{remaining_ce_string}"
+    else:
+        raise ValueError(f"Unknown code_output_format: {code_output_format}")
+
+    # wrapping with code output separators
+    return output
+
+
+def _extract_between_separators(generation: str, separators: Tuple[str, str], extract_all: bool = False):
+    """Extracting all text between last occurrence of separators[0] and [1].
+
+    If extract_all is True, returning a list with all occurrences of text between separators.
+    """
+    if extract_all:
+        separators = [re.escape(sp) for sp in separators]
+        pattern = f'{separators[0]}(.*?){separators[1]}'
+        return re.findall(pattern, generation, re.DOTALL)
+    return generation.split(separators[0])[-1].split(separators[1])[0]
+
+
+def extract_code_to_execute(generation: str, code_begin: str, code_end: str, extract_all: bool = False):
+    return _extract_between_separators(generation, [code_begin, code_end], extract_all)
+
+
+def extract_code_output(generation: str, code_output_begin: str, code_output_end: str, extract_all: bool = False):
+    return _extract_between_separators(generation, [code_output_begin, code_output_end], extract_all)
+
+
+def extract_code_block(text: str, languages=None) -> str:
+    if languages is None:
+        languages = [""]
+    for language in languages:
+        match = re.search(rf"```{language}\s*\n?(.*?)\n?```", text, re.DOTALL)
+        if match:
+            return match.group(1).strip()
+    return ""
+
+
+def clean_formal_generation(generation: str, final_answer_key: str = "**FINAL ANSWER**") -> str:
+    # Extract part after **FINAL ANSWER** if present
+    if final_answer_key in generation:
+        generation = generation.split(final_answer_key, 1)[1].strip()
+
+    languages = ["lean4", "lean3", "lean", ""]
+    extracted_code = extract_code_block(generation, languages)
+    if extracted_code:
+        return extracted_code
+
+    # If no explicit code block, remove any surrounding triple backticks
+    return re.sub(r"^\s*```(?:lean4|lean3|lean)?\s*|\s*```[\s]*$", "", generation).strip()
--- a/nemo_skills/conversion/__init__.py
+++ b/nemo_skills/conversion/__init__.py
--- a/nemo_skills/conversion/hf_to_nemo_llama.py
+++ b/nemo_skills/conversion/hf_to_nemo_llama.py
--- a/nemo_skills/conversion/hf_to_nemo_qwen.py
+++ b/nemo_skills/conversion/hf_to_nemo_qwen.py
--- a/nemo_skills/conversion/hf_to_trtllm_quantize.py
+++ b/nemo_skills/conversion/hf_to_trtllm_quantize.py
--- a/nemo_skills/conversion/nemo_config_llama.yaml
+++ b/nemo_skills/conversion/nemo_config_llama.yaml
--- a/nemo_skills/conversion/nemo_config_qwen.yaml
+++ b/nemo_skills/conversion/nemo_config_qwen.yaml
--- a/nemo_skills/conversion/nemo_to_hf_llama.py
+++ b/nemo_skills/conversion/nemo_to_hf_llama.py
--- a/nemo_skills/conversion/nemo_to_hf_qwen.py
+++ b/nemo_skills/conversion/nemo_to_hf_qwen.py
--- a/nemo_skills/conversion/save_sharded_state.py
+++ b/nemo_skills/conversion/save_sharded_state.py
--- a/nemo_skills/dataset/__init__.py
+++ b/nemo_skills/dataset/__init__.py
--- a/nemo_skills/dataset/aai/__init__.py
+++ b/nemo_skills/dataset/aai/__init__.py
--- a/nemo_skills/dataset/aai/aai_score.py
+++ b/nemo_skills/dataset/aai/aai_score.py
--- a/nemo_skills/dataset/aai/prepare.py
+++ b/nemo_skills/dataset/aai/prepare.py
--- a/nemo_skills/dataset/aime24/__init__.py
+++ b/nemo_skills/dataset/aime24/__init__.py
--- a/nemo_skills/dataset/aime24/prepare.py
+++ b/nemo_skills/dataset/aime24/prepare.py
--- a/nemo_skills/dataset/aime24/test.txt
+++ b/nemo_skills/dataset/aime24/test.txt
--- a/nemo_skills/dataset/aime25/__init__.py
+++ b/nemo_skills/dataset/aime25/__init__.py
--- a/nemo_skills/dataset/aime25/prepare.py
+++ b/nemo_skills/dataset/aime25/prepare.py
--- a/nemo_skills/dataset/aime25/test.txt
+++ b/nemo_skills/dataset/aime25/test.txt
--- a/nemo_skills/dataset/algebra222/__init__.py
+++ b/nemo_skills/dataset/algebra222/__init__.py
--- a/nemo_skills/dataset/algebra222/prepare.py
+++ b/nemo_skills/dataset/algebra222/prepare.py
--- a/nemo_skills/dataset/amc23/__init__.py
+++ b/nemo_skills/dataset/amc23/__init__.py
--- a/nemo_skills/dataset/amc23/prepare.py
+++ b/nemo_skills/dataset/amc23/prepare.py
--- a/nemo_skills/dataset/answer-judge/__init__.py
+++ b/nemo_skills/dataset/answer-judge/__init__.py
--- a/nemo_skills/dataset/answer-judge/prepare.py
+++ b/nemo_skills/dataset/answer-judge/prepare.py
--- a/nemo_skills/dataset/arena-hard/__init__.py
+++ b/nemo_skills/dataset/arena-hard/__init__.py
--- a/nemo_skills/dataset/arena-hard/prepare.py
+++ b/nemo_skills/dataset/arena-hard/prepare.py
--- a/nemo_skills/dataset/asdiv/__init__.py
+++ b/nemo_skills/dataset/asdiv/__init__.py
--- a/nemo_skills/dataset/asdiv/prepare.py
+++ b/nemo_skills/dataset/asdiv/prepare.py
--- a/nemo_skills/dataset/bfcl_v3/__init__.py
+++ b/nemo_skills/dataset/bfcl_v3/__init__.py
--- a/nemo_skills/dataset/bfcl_v3/bfcl_score.py
+++ b/nemo_skills/dataset/bfcl_v3/bfcl_score.py
--- a/nemo_skills/dataset/bfcl_v3/constants.py
+++ b/nemo_skills/dataset/bfcl_v3/constants.py
--- a/nemo_skills/dataset/bfcl_v3/prepare.py
+++ b/nemo_skills/dataset/bfcl_v3/prepare.py
--- a/nemo_skills/dataset/bfcl_v3/utils.py
+++ b/nemo_skills/dataset/bfcl_v3/utils.py
--- a/nemo_skills/dataset/brumo25/__init__.py
+++ b/nemo_skills/dataset/brumo25/__init__.py
--- a/nemo_skills/dataset/brumo25/prepare.py
+++ b/nemo_skills/dataset/brumo25/prepare.py
--- a/nemo_skills/dataset/college_math/__init__.py
+++ b/nemo_skills/dataset/college_math/__init__.py
--- a/nemo_skills/dataset/college_math/prepare.py
+++ b/nemo_skills/dataset/college_math/prepare.py
--- a/nemo_skills/dataset/comp-math-24-25/__init__.py
+++ b/nemo_skills/dataset/comp-math-24-25/__init__.py
--- a/nemo_skills/dataset/comp-math-24-25/prepare.py
+++ b/nemo_skills/dataset/comp-math-24-25/prepare.py
--- a/nemo_skills/dataset/comp-math-24-25/test.txt
+++ b/nemo_skills/dataset/comp-math-24-25/test.txt
--- a/nemo_skills/dataset/gaokao2023en/__init__.py
+++ b/nemo_skills/dataset/gaokao2023en/__init__.py
--- a/nemo_skills/dataset/gaokao2023en/prepare.py
+++ b/nemo_skills/dataset/gaokao2023en/prepare.py
--- a/nemo_skills/dataset/gpqa/__init__.py
+++ b/nemo_skills/dataset/gpqa/__init__.py
--- a/nemo_skills/dataset/gpqa/prepare.py
+++ b/nemo_skills/dataset/gpqa/prepare.py
--- a/nemo_skills/dataset/gsm-plus/__init__.py
+++ b/nemo_skills/dataset/gsm-plus/__init__.py
--- a/nemo_skills/dataset/gsm-plus/prepare.py
+++ b/nemo_skills/dataset/gsm-plus/prepare.py
--- a/nemo_skills/dataset/gsm8k/__init__.py
+++ b/nemo_skills/dataset/gsm8k/__init__.py
--- a/nemo_skills/dataset/gsm8k/prepare.py
+++ b/nemo_skills/dataset/gsm8k/prepare.py
--- a/nemo_skills/dataset/hle/__init__.py
+++ b/nemo_skills/dataset/hle/__init__.py
--- a/nemo_skills/dataset/hle/prepare.py
+++ b/nemo_skills/dataset/hle/prepare.py
--- a/nemo_skills/dataset/hmmt_feb25/__init__.py
+++ b/nemo_skills/dataset/hmmt_feb25/__init__.py
--- a/nemo_skills/dataset/hmmt_feb25/prepare.py
+++ b/nemo_skills/dataset/hmmt_feb25/prepare.py
--- a/nemo_skills/dataset/human-eval/__init__.py
+++ b/nemo_skills/dataset/human-eval/__init__.py
--- a/nemo_skills/dataset/human-eval/prepare.py
+++ b/nemo_skills/dataset/human-eval/prepare.py
--- a/nemo_skills/dataset/ifbench/__init__.py
+++ b/nemo_skills/dataset/ifbench/__init__.py
--- a/nemo_skills/dataset/ifbench/prepare.py
+++ b/nemo_skills/dataset/ifbench/prepare.py
--- a/nemo_skills/dataset/ifeval/__init__.py
+++ b/nemo_skills/dataset/ifeval/__init__.py
--- a/nemo_skills/dataset/ifeval/prepare.py
+++ b/nemo_skills/dataset/ifeval/prepare.py
--- a/nemo_skills/dataset/livecodebench-pro/__init__.py
+++ b/nemo_skills/dataset/livecodebench-pro/__init__.py
--- a/nemo_skills/dataset/livecodebench-pro/prepare.py
+++ b/nemo_skills/dataset/livecodebench-pro/prepare.py
--- a/nemo_skills/dataset/livecodebench-x/__init__.py
+++ b/nemo_skills/dataset/livecodebench-x/__init__.py
--- a/nemo_skills/dataset/livecodebench-x/de/__init__.py
+++ b/nemo_skills/dataset/livecodebench-x/de/__init__.py
--- a/nemo_skills/dataset/livecodebench-x/en/__init__.py
+++ b/nemo_skills/dataset/livecodebench-x/en/__init__.py
--- a/nemo_skills/dataset/livecodebench-x/prepare.py
+++ b/nemo_skills/dataset/livecodebench-x/prepare.py
--- a/nemo_skills/dataset/livecodebench/__init__.py
+++ b/nemo_skills/dataset/livecodebench/__init__.py
--- a/nemo_skills/dataset/livecodebench/prepare.py
+++ b/nemo_skills/dataset/livecodebench/prepare.py
--- a/nemo_skills/dataset/math-500/__init__.py
+++ b/nemo_skills/dataset/math-500/__init__.py
--- a/nemo_skills/dataset/math-500/prepare.py
+++ b/nemo_skills/dataset/math-500/prepare.py
--- a/nemo_skills/dataset/math-odyssey/__init__.py
+++ b/nemo_skills/dataset/math-odyssey/__init__.py
--- a/nemo_skills/dataset/math-odyssey/prepare.py
+++ b/nemo_skills/dataset/math-odyssey/prepare.py
--- a/nemo_skills/dataset/math/__init__.py
+++ b/nemo_skills/dataset/math/__init__.py
--- a/nemo_skills/dataset/math/fix_ref_solns.py
+++ b/nemo_skills/dataset/math/fix_ref_solns.py
--- a/nemo_skills/dataset/math/prepare.py
+++ b/nemo_skills/dataset/math/prepare.py
--- a/nemo_skills/dataset/mawps/__init__.py
+++ b/nemo_skills/dataset/mawps/__init__.py
--- a/nemo_skills/dataset/mawps/prepare.py
+++ b/nemo_skills/dataset/mawps/prepare.py
--- a/nemo_skills/dataset/mbpp/__init__.py
+++ b/nemo_skills/dataset/mbpp/__init__.py
--- a/nemo_skills/dataset/mbpp/prepare.py
+++ b/nemo_skills/dataset/mbpp/prepare.py
--- a/nemo_skills/dataset/minerva_math/__init__.py
+++ b/nemo_skills/dataset/minerva_math/__init__.py
--- a/nemo_skills/dataset/minerva_math/prepare.py
+++ b/nemo_skills/dataset/minerva_math/prepare.py
--- a/nemo_skills/dataset/minif2f/__init__.py
+++ b/nemo_skills/dataset/minif2f/__init__.py
--- a/nemo_skills/dataset/minif2f/prepare.py
+++ b/nemo_skills/dataset/minif2f/prepare.py
--- a/nemo_skills/dataset/mmlu-pro/__init__.py
+++ b/nemo_skills/dataset/mmlu-pro/__init__.py
--- a/nemo_skills/dataset/mmlu-pro/prepare.py
+++ b/nemo_skills/dataset/mmlu-pro/prepare.py
--- a/nemo_skills/dataset/mmlu-redux/__init__.py
+++ b/nemo_skills/dataset/mmlu-redux/__init__.py
--- a/nemo_skills/dataset/mmlu-redux/prepare.py
+++ b/nemo_skills/dataset/mmlu-redux/prepare.py
--- a/nemo_skills/dataset/mmlu/__init__.py
+++ b/nemo_skills/dataset/mmlu/__init__.py
--- a/nemo_skills/dataset/mmlu/prepare.py
+++ b/nemo_skills/dataset/mmlu/prepare.py
--- a/nemo_skills/dataset/olympiadbench/__init__.py
+++ b/nemo_skills/dataset/olympiadbench/__init__.py
--- a/nemo_skills/dataset/olympiadbench/prepare.py
+++ b/nemo_skills/dataset/olympiadbench/prepare.py
--- a/nemo_skills/dataset/omni-math/__init__.py
+++ b/nemo_skills/dataset/omni-math/__init__.py
--- a/nemo_skills/dataset/omni-math/prepare.py
+++ b/nemo_skills/dataset/omni-math/prepare.py
--- a/nemo_skills/dataset/prepare.py
+++ b/nemo_skills/dataset/prepare.py
--- a/nemo_skills/dataset/proofnet/__init__.py
+++ b/nemo_skills/dataset/proofnet/__init__.py
--- a/nemo_skills/dataset/proofnet/prepare.py
+++ b/nemo_skills/dataset/proofnet/prepare.py
--- a/nemo_skills/dataset/putnam-bench/__init__.py
+++ b/nemo_skills/dataset/putnam-bench/__init__.py
--- a/nemo_skills/dataset/putnam-bench/prepare.py
+++ b/nemo_skills/dataset/putnam-bench/prepare.py
--- a/nemo_skills/dataset/ruler/__init__.py
+++ b/nemo_skills/dataset/ruler/__init__.py
--- a/nemo_skills/dataset/ruler/prepare.py
+++ b/nemo_skills/dataset/ruler/prepare.py
--- a/nemo_skills/dataset/ruler/ruler_score.py
+++ b/nemo_skills/dataset/ruler/ruler_score.py
--- a/nemo_skills/dataset/scicode/__init__.py
+++ b/nemo_skills/dataset/scicode/__init__.py
--- a/nemo_skills/dataset/scicode/prepare.py
+++ b/nemo_skills/dataset/scicode/prepare.py
--- a/nemo_skills/dataset/svamp/__init__.py
+++ b/nemo_skills/dataset/svamp/__init__.py
--- a/nemo_skills/dataset/svamp/prepare.py
+++ b/nemo_skills/dataset/svamp/prepare.py
--- a/nemo_skills/dataset/turing-eval/__init__.py
+++ b/nemo_skills/dataset/turing-eval/__init__.py
--- a/nemo_skills/dataset/turing-eval/prepare.py
+++ b/nemo_skills/dataset/turing-eval/prepare.py
--- a/nemo_skills/dataset/utils.py
+++ b/nemo_skills/dataset/utils.py
--- a/nemo_skills/evaluation/__init__.py
+++ b/nemo_skills/evaluation/__init__.py
--- a/nemo_skills/evaluation/aggregate_answers.py
+++ b/nemo_skills/evaluation/aggregate_answers.py
--- a/nemo_skills/evaluation/compute_group_score.py
+++ b/nemo_skills/evaluation/compute_group_score.py
--- a/nemo_skills/evaluation/evaluate_results.py
+++ b/nemo_skills/evaluation/evaluate_results.py
--- a/nemo_skills/evaluation/evaluator/__init__.py
+++ b/nemo_skills/evaluation/evaluator/__init__.py
--- a/nemo_skills/evaluation/evaluator/arena.py
+++ b/nemo_skills/evaluation/evaluator/arena.py
--- a/nemo_skills/evaluation/evaluator/bfcl.py
+++ b/nemo_skills/evaluation/evaluator/bfcl.py
--- a/nemo_skills/evaluation/evaluator/code.py
+++ b/nemo_skills/evaluation/evaluator/code.py
--- a/nemo_skills/evaluation/evaluator/ifbench.py
+++ b/nemo_skills/evaluation/evaluator/ifbench.py
--- a/nemo_skills/evaluation/evaluator/ifeval.py
+++ b/nemo_skills/evaluation/evaluator/ifeval.py
--- a/nemo_skills/evaluation/evaluator/math.py
+++ b/nemo_skills/evaluation/evaluator/math.py
--- a/nemo_skills/evaluation/evaluator/mcq.py
+++ b/nemo_skills/evaluation/evaluator/mcq.py
--- a/nemo_skills/evaluation/evaluator/ruler.py
+++ b/nemo_skills/evaluation/evaluator/ruler.py
--- a/nemo_skills/evaluation/evaluator/scicode.py
+++ b/nemo_skills/evaluation/evaluator/scicode.py
--- a/nemo_skills/evaluation/math_grader.py
+++ b/nemo_skills/evaluation/math_grader.py
--- a/nemo_skills/evaluation/metrics/__init__.py
+++ b/nemo_skills/evaluation/metrics/__init__.py
--- a/nemo_skills/evaluation/metrics/answer_judgement_metrics.py
+++ b/nemo_skills/evaluation/metrics/answer_judgement_metrics.py
--- a/nemo_skills/evaluation/metrics/arena_metrics.py
+++ b/nemo_skills/evaluation/metrics/arena_metrics.py
--- a/nemo_skills/evaluation/metrics/base.py
+++ b/nemo_skills/evaluation/metrics/base.py
--- a/nemo_skills/evaluation/metrics/bfcl_metrics.py
+++ b/nemo_skills/evaluation/metrics/bfcl_metrics.py
--- a/nemo_skills/evaluation/metrics/code_metrics.py
+++ b/nemo_skills/evaluation/metrics/code_metrics.py
--- a/nemo_skills/evaluation/metrics/compute_metrics.py
+++ b/nemo_skills/evaluation/metrics/compute_metrics.py
--- a/nemo_skills/evaluation/metrics/if_metrics.py
+++ b/nemo_skills/evaluation/metrics/if_metrics.py
--- a/nemo_skills/evaluation/metrics/lean4_metrics.py
+++ b/nemo_skills/evaluation/metrics/lean4_metrics.py
--- a/nemo_skills/evaluation/metrics/map_metrics.py
+++ b/nemo_skills/evaluation/metrics/map_metrics.py
--- a/nemo_skills/evaluation/metrics/math_metrics.py
+++ b/nemo_skills/evaluation/metrics/math_metrics.py
--- a/nemo_skills/evaluation/metrics/ruler_metrics.py
+++ b/nemo_skills/evaluation/metrics/ruler_metrics.py
--- a/nemo_skills/evaluation/metrics/utils.py
+++ b/nemo_skills/evaluation/metrics/utils.py
--- a/nemo_skills/evaluation/utils.py
+++ b/nemo_skills/evaluation/utils.py
--- a/nemo_skills/file_utils.py
+++ b/nemo_skills/file_utils.py
--- a/nemo_skills/inference/__init__.py
+++ b/nemo_skills/inference/__init__.py
--- a/nemo_skills/inference/chat_interface/__init__.py
+++ b/nemo_skills/inference/chat_interface/__init__.py
--- a/nemo_skills/inference/chat_interface/chat_service.py
+++ b/nemo_skills/inference/chat_interface/chat_service.py
--- a/nemo_skills/inference/chat_interface/core.py
+++ b/nemo_skills/inference/chat_interface/core.py
--- a/nemo_skills/inference/chat_interface/launch.py
+++ b/nemo_skills/inference/chat_interface/launch.py
--- a/nemo_skills/inference/chat_interface/ui.py
+++ b/nemo_skills/inference/chat_interface/ui.py
--- a/nemo_skills/inference/check_contamination.py
+++ b/nemo_skills/inference/check_contamination.py
--- a/nemo_skills/inference/eval/__init__.py
+++ b/nemo_skills/inference/eval/__init__.py
--- a/nemo_skills/inference/eval/arena_judge.py
+++ b/nemo_skills/inference/eval/arena_judge.py
--- a/nemo_skills/inference/eval/bfcl.py
+++ b/nemo_skills/inference/eval/bfcl.py
--- a/nemo_skills/inference/eval/bfcl_utils.py
+++ b/nemo_skills/inference/eval/bfcl_utils.py
--- a/nemo_skills/inference/eval/scicode.py
+++ b/nemo_skills/inference/eval/scicode.py
--- a/nemo_skills/inference/eval/scicode_utils.py
+++ b/nemo_skills/inference/eval/scicode_utils.py
--- a/nemo_skills/inference/eval/swebench.py
+++ b/nemo_skills/inference/eval/swebench.py
--- a/nemo_skills/inference/generate.py
+++ b/nemo_skills/inference/generate.py
--- a/nemo_skills/inference/genselect.py
+++ b/nemo_skills/inference/genselect.py
--- a/nemo_skills/inference/genselect_preprocess.py
+++ b/nemo_skills/inference/genselect_preprocess.py
--- a/nemo_skills/inference/llm_math_judge.py
+++ b/nemo_skills/inference/llm_math_judge.py
--- a/nemo_skills/inference/log_samples_wandb.py
+++ b/nemo_skills/inference/log_samples_wandb.py
--- a/nemo_skills/inference/merge_chunks.py
+++ b/nemo_skills/inference/merge_chunks.py
--- a/nemo_skills/inference/model/__init__.py
+++ b/nemo_skills/inference/model/__init__.py
--- a/nemo_skills/inference/model/azure.py
+++ b/nemo_skills/inference/model/azure.py
--- a/nemo_skills/inference/model/base.py
+++ b/nemo_skills/inference/model/base.py
--- a/nemo_skills/inference/model/code_execution.py
+++ b/nemo_skills/inference/model/code_execution.py
--- a/nemo_skills/inference/model/megatron.py
+++ b/nemo_skills/inference/model/megatron.py
--- a/nemo_skills/inference/model/openai.py
+++ b/nemo_skills/inference/model/openai.py
--- a/nemo_skills/inference/model/utils.py
+++ b/nemo_skills/inference/model/utils.py
--- a/nemo_skills/inference/model/vllm.py
+++ b/nemo_skills/inference/model/vllm.py
--- a/nemo_skills/inference/retrieve_similar.py
+++ b/nemo_skills/inference/retrieve_similar.py
--- a/nemo_skills/inference/server/__init__.py
+++ b/nemo_skills/inference/server/__init__.py
--- a/nemo_skills/inference/server/serve_sglang.py
+++ b/nemo_skills/inference/server/serve_sglang.py
--- a/nemo_skills/inference/server/serve_vllm.py
+++ b/nemo_skills/inference/server/serve_vllm.py
--- a/nemo_skills/pipeline/__init__.py
+++ b/nemo_skills/pipeline/__init__.py
--- a/nemo_skills/pipeline/app.py
+++ b/nemo_skills/pipeline/app.py
--- a/nemo_skills/pipeline/cli.py
+++ b/nemo_skills/pipeline/cli.py
--- a/nemo_skills/pipeline/convert.py
+++ b/nemo_skills/pipeline/convert.py
--- a/nemo_skills/pipeline/eval.py
+++ b/nemo_skills/pipeline/eval.py
--- a/nemo_skills/pipeline/generate.py
+++ b/nemo_skills/pipeline/generate.py
--- a/nemo_skills/pipeline/genselect.py
+++ b/nemo_skills/pipeline/genselect.py
--- a/nemo_skills/pipeline/nemo_rl/__init__.py
+++ b/nemo_skills/pipeline/nemo_rl/__init__.py
--- a/nemo_skills/pipeline/nemo_rl/grpo.py
+++ b/nemo_skills/pipeline/nemo_rl/grpo.py
--- a/nemo_skills/pipeline/nemo_rl/sft.py
+++ b/nemo_skills/pipeline/nemo_rl/sft.py
--- a/nemo_skills/pipeline/openrlhf/__init__.py
+++ b/nemo_skills/pipeline/openrlhf/__init__.py
--- a/nemo_skills/pipeline/openrlhf/ppo.py
+++ b/nemo_skills/pipeline/openrlhf/ppo.py
--- a/nemo_skills/pipeline/openrlhf/sft.py
+++ b/nemo_skills/pipeline/openrlhf/sft.py
--- a/nemo_skills/pipeline/prepare_data.py
+++ b/nemo_skills/pipeline/prepare_data.py
--- a/nemo_skills/pipeline/run_cmd.py
+++ b/nemo_skills/pipeline/run_cmd.py
--- a/nemo_skills/pipeline/setup.py
+++ b/nemo_skills/pipeline/setup.py
--- a/nemo_skills/pipeline/start_server.py
+++ b/nemo_skills/pipeline/start_server.py
--- a/nemo_skills/pipeline/summarize_results.py
+++ b/nemo_skills/pipeline/summarize_results.py
--- a/nemo_skills/pipeline/train.py
+++ b/nemo_skills/pipeline/train.py
--- a/nemo_skills/pipeline/utils/__init__.py
+++ b/nemo_skills/pipeline/utils/__init__.py
--- a/nemo_skills/pipeline/utils/cluster.py
+++ b/nemo_skills/pipeline/utils/cluster.py
--- a/nemo_skills/pipeline/utils/eval.py
+++ b/nemo_skills/pipeline/utils/eval.py
--- a/nemo_skills/pipeline/utils/exp.py
+++ b/nemo_skills/pipeline/utils/exp.py
--- a/nemo_skills/pipeline/utils/generation.py
+++ b/nemo_skills/pipeline/utils/generation.py
--- a/nemo_skills/pipeline/utils/mounts.py
+++ b/nemo_skills/pipeline/utils/mounts.py
--- a/nemo_skills/pipeline/utils/packager.py
+++ b/nemo_skills/pipeline/utils/packager.py
--- a/nemo_skills/pipeline/utils/server.py
+++ b/nemo_skills/pipeline/utils/server.py
--- a/nemo_skills/pipeline/verl/__init__.py
+++ b/nemo_skills/pipeline/verl/__init__.py
--- a/nemo_skills/pipeline/verl/ppo.py
+++ b/nemo_skills/pipeline/verl/ppo.py
--- a/nemo_skills/prompt/README.md
+++ b/nemo_skills/prompt/README.md
--- a/nemo_skills/prompt/__init__.py
+++ b/nemo_skills/prompt/__init__.py
--- a/nemo_skills/prompt/code_tags/__init__.py
+++ b/nemo_skills/prompt/code_tags/__init__.py
--- a/nemo_skills/prompt/code_tags/llama3.yaml
+++ b/nemo_skills/prompt/code_tags/llama3.yaml
--- a/nemo_skills/prompt/code_tags/nemotron.yaml
+++ b/nemo_skills/prompt/code_tags/nemotron.yaml
--- a/nemo_skills/prompt/code_tags/openmath.yaml
+++ b/nemo_skills/prompt/code_tags/openmath.yaml
--- a/nemo_skills/prompt/code_tags/qwen-lean.yaml
+++ b/nemo_skills/prompt/code_tags/qwen-lean.yaml
--- a/nemo_skills/prompt/code_tags/qwen.yaml
+++ b/nemo_skills/prompt/code_tags/qwen.yaml
--- a/nemo_skills/prompt/config/__init__.py
+++ b/nemo_skills/prompt/config/__init__.py
--- a/nemo_skills/prompt/config/eval/aai/livecodebench.yaml
+++ b/nemo_skills/prompt/config/eval/aai/livecodebench.yaml
--- a/nemo_skills/prompt/config/eval/aai/math.yaml
+++ b/nemo_skills/prompt/config/eval/aai/math.yaml
--- a/nemo_skills/prompt/config/eval/aai/mcq-10choices-boxed.yaml
+++ b/nemo_skills/prompt/config/eval/aai/mcq-10choices-boxed.yaml
--- a/nemo_skills/prompt/config/eval/aai/mcq-10choices.yaml
+++ b/nemo_skills/prompt/config/eval/aai/mcq-10choices.yaml
--- a/nemo_skills/prompt/config/eval/aai/mcq-4choices-boxed.yaml
+++ b/nemo_skills/prompt/config/eval/aai/mcq-4choices-boxed.yaml
--- a/nemo_skills/prompt/config/eval/aai/mcq-4choices.yaml
+++ b/nemo_skills/prompt/config/eval/aai/mcq-4choices.yaml
--- a/nemo_skills/prompt/config/eval/livecodebench/cpp_codegen.yaml
+++ b/nemo_skills/prompt/config/eval/livecodebench/cpp_codegen.yaml
--- a/nemo_skills/prompt/config/eval/livecodebench/cpp_codegen_reasoning.yaml
+++ b/nemo_skills/prompt/config/eval/livecodebench/cpp_codegen_reasoning.yaml
--- a/nemo_skills/prompt/config/eval/livecodebench/python_codegen.yaml
+++ b/nemo_skills/prompt/config/eval/livecodebench/python_codegen.yaml
--- a/nemo_skills/prompt/config/eval/livecodebench/python_codegen_reasoning.yaml
+++ b/nemo_skills/prompt/config/eval/livecodebench/python_codegen_reasoning.yaml
--- a/nemo_skills/prompt/config/eval/scicode/background.yaml
+++ b/nemo_skills/prompt/config/eval/scicode/background.yaml
--- a/nemo_skills/prompt/config/eval/scicode/default.yaml
+++ b/nemo_skills/prompt/config/eval/scicode/default.yaml
--- a/nemo_skills/prompt/config/generic/__init__.py
+++ b/nemo_skills/prompt/config/generic/__init__.py
--- a/nemo_skills/prompt/config/generic/codegen.yaml
+++ b/nemo_skills/prompt/config/generic/codegen.yaml
--- a/nemo_skills/prompt/config/generic/codegen_system.yaml
+++ b/nemo_skills/prompt/config/generic/codegen_system.yaml
--- a/nemo_skills/prompt/config/generic/default.yaml
+++ b/nemo_skills/prompt/config/generic/default.yaml
--- a/nemo_skills/prompt/config/generic/general-boxed.yaml
+++ b/nemo_skills/prompt/config/generic/general-boxed.yaml
--- a/nemo_skills/prompt/config/generic/hle.yaml
+++ b/nemo_skills/prompt/config/generic/hle.yaml
--- a/nemo_skills/prompt/config/generic/math-base.yaml
+++ b/nemo_skills/prompt/config/generic/math-base.yaml
--- a/nemo_skills/prompt/config/generic/math.yaml
+++ b/nemo_skills/prompt/config/generic/math.yaml
--- a/nemo_skills/prompt/config/generic/problem-augmentation-similar.yaml
+++ b/nemo_skills/prompt/config/generic/problem-augmentation-similar.yaml
--- a/nemo_skills/prompt/config/generic/problem-augmentation.yaml
+++ b/nemo_skills/prompt/config/generic/problem-augmentation.yaml
--- a/nemo_skills/prompt/config/judge/__init__.py
+++ b/nemo_skills/prompt/config/judge/__init__.py
--- a/nemo_skills/prompt/config/judge/arena.yaml
+++ b/nemo_skills/prompt/config/judge/arena.yaml
--- a/nemo_skills/prompt/config/judge/check-contamination.yaml
+++ b/nemo_skills/prompt/config/judge/check-contamination.yaml
--- a/nemo_skills/prompt/config/judge/code.yaml
+++ b/nemo_skills/prompt/config/judge/code.yaml
--- a/nemo_skills/prompt/config/judge/general-judge.yaml
+++ b/nemo_skills/prompt/config/judge/general-judge.yaml
--- a/nemo_skills/prompt/config/judge/hle.yaml
+++ b/nemo_skills/prompt/config/judge/hle.yaml
--- a/nemo_skills/prompt/config/judge/math-code.yaml
+++ b/nemo_skills/prompt/config/judge/math-code.yaml
--- a/nemo_skills/prompt/config/judge/math.yaml
+++ b/nemo_skills/prompt/config/judge/math.yaml
--- a/nemo_skills/prompt/config/judge/mt-bench/turn1.yaml
+++ b/nemo_skills/prompt/config/judge/mt-bench/turn1.yaml
--- a/nemo_skills/prompt/config/judge/mt-bench/turn1_with_ref.yaml
+++ b/nemo_skills/prompt/config/judge/mt-bench/turn1_with_ref.yaml
--- a/nemo_skills/prompt/config/judge/mt-bench/turn2.yaml
+++ b/nemo_skills/prompt/config/judge/mt-bench/turn2.yaml
--- a/nemo_skills/prompt/config/judge/mt-bench/turn2_with_ref.yaml
+++ b/nemo_skills/prompt/config/judge/mt-bench/turn2_with_ref.yaml
--- a/nemo_skills/prompt/config/lean4/formal-proof-deepseek-prover-v2.yaml
+++ b/nemo_skills/prompt/config/lean4/formal-proof-deepseek-prover-v2.yaml
--- a/nemo_skills/prompt/config/lean4/formal-proof-reasoning-execution.yaml
+++ b/nemo_skills/prompt/config/lean4/formal-proof-reasoning-execution.yaml
--- a/nemo_skills/prompt/config/lean4/formal-proof-reasoning.yaml
+++ b/nemo_skills/prompt/config/lean4/formal-proof-reasoning.yaml
--- a/nemo_skills/prompt/config/lean4/formal-proof.yaml
+++ b/nemo_skills/prompt/config/lean4/formal-proof.yaml
--- a/nemo_skills/prompt/config/lean4/nat-to-lean4.yaml
+++ b/nemo_skills/prompt/config/lean4/nat-to-lean4.yaml
--- a/nemo_skills/prompt/config/llama3-instruct/__init__.py
+++ b/nemo_skills/prompt/config/llama3-instruct/__init__.py
--- a/nemo_skills/prompt/config/llama3-instruct/gsm8k.yaml
+++ b/nemo_skills/prompt/config/llama3-instruct/gsm8k.yaml
--- a/nemo_skills/prompt/config/llama3-instruct/math.yaml
+++ b/nemo_skills/prompt/config/llama3-instruct/math.yaml
--- a/nemo_skills/prompt/config/llama3-instruct/mmlu.yaml
+++ b/nemo_skills/prompt/config/llama3-instruct/mmlu.yaml
--- a/nemo_skills/prompt/config/openmath/genselect.yaml
+++ b/nemo_skills/prompt/config/openmath/genselect.yaml
--- a/nemo_skills/prompt/config/openmath/tir.yaml
+++ b/nemo_skills/prompt/config/openmath/tir.yaml
--- a/nemo_skills/prompt/config/qwen/__init__.py
+++ b/nemo_skills/prompt/config/qwen/__init__.py
--- a/nemo_skills/prompt/config/qwen/math-cot.yaml
+++ b/nemo_skills/prompt/config/qwen/math-cot.yaml
--- a/nemo_skills/prompt/config/qwen/math-tir.yaml
+++ b/nemo_skills/prompt/config/qwen/math-tir.yaml
--- a/nemo_skills/prompt/config/qwen/qwq.yaml
+++ b/nemo_skills/prompt/config/qwen/qwq.yaml
--- a/nemo_skills/prompt/config/qwen3/math-cot-non-think.yaml
+++ b/nemo_skills/prompt/config/qwen3/math-cot-non-think.yaml
--- a/nemo_skills/prompt/config/qwen3/math-cot-think.yaml
+++ b/nemo_skills/prompt/config/qwen3/math-cot-think.yaml
--- a/nemo_skills/prompt/config/unit_test/code.yaml
+++ b/nemo_skills/prompt/config/unit_test/code.yaml
--- a/nemo_skills/prompt/few_shot_examples/__init__.py
+++ b/nemo_skills/prompt/few_shot_examples/__init__.py
--- a/nemo_skills/prompt/few_shot_examples/gsm8k.py
+++ b/nemo_skills/prompt/few_shot_examples/gsm8k.py
--- a/nemo_skills/prompt/few_shot_examples/lean4.py
+++ b/nemo_skills/prompt/few_shot_examples/lean4.py
--- a/nemo_skills/prompt/few_shot_examples/math.py
+++ b/nemo_skills/prompt/few_shot_examples/math.py
--- a/nemo_skills/prompt/few_shot_examples/mmlu.py
+++ b/nemo_skills/prompt/few_shot_examples/mmlu.py
--- a/nemo_skills/prompt/few_shot_examples/mmlu_pro.py
+++ b/nemo_skills/prompt/few_shot_examples/mmlu_pro.py
--- a/nemo_skills/prompt/template/__init__.py
+++ b/nemo_skills/prompt/template/__init__.py
--- a/nemo_skills/prompt/template/deepseek-instruct.yaml
+++ b/nemo_skills/prompt/template/deepseek-instruct.yaml
--- a/nemo_skills/prompt/template/deepseek-prover-v2.yaml
+++ b/nemo_skills/prompt/template/deepseek-prover-v2.yaml
--- a/nemo_skills/prompt/template/deepseek-prover.yaml
+++ b/nemo_skills/prompt/template/deepseek-prover.yaml
--- a/nemo_skills/prompt/template/default-base.yaml
+++ b/nemo_skills/prompt/template/default-base.yaml
--- a/nemo_skills/prompt/template/llama3-base.yaml
+++ b/nemo_skills/prompt/template/llama3-base.yaml
--- a/nemo_skills/prompt/template/llama3-instruct-nosys.yaml
+++ b/nemo_skills/prompt/template/llama3-instruct-nosys.yaml
--- a/nemo_skills/prompt/template/llama3-instruct.yaml
+++ b/nemo_skills/prompt/template/llama3-instruct.yaml
--- a/nemo_skills/prompt/template/mistral-instruct.yaml
+++ b/nemo_skills/prompt/template/mistral-instruct.yaml
--- a/nemo_skills/prompt/template/nemotron-instruct.yaml
+++ b/nemo_skills/prompt/template/nemotron-instruct.yaml
--- a/nemo_skills/prompt/template/qwen-instruct.yaml
+++ b/nemo_skills/prompt/template/qwen-instruct.yaml
--- a/nemo_skills/prompt/utils.py
+++ b/nemo_skills/prompt/utils.py
--- a/nemo_skills/training/__init__.py
+++ b/nemo_skills/training/__init__.py
--- a/nemo_skills/training/average_checkpoints.py
+++ b/nemo_skills/training/average_checkpoints.py
--- a/nemo_skills/training/copy_checkpoint.py
+++ b/nemo_skills/training/copy_checkpoint.py
--- a/nemo_skills/training/data_preparation_utils/__init__.py
+++ b/nemo_skills/training/data_preparation_utils/__init__.py
--- a/nemo_skills/training/data_preparation_utils/arithmetic_utils.py
+++ b/nemo_skills/training/data_preparation_utils/arithmetic_utils.py
--- a/nemo_skills/training/data_preparation_utils/config/code_sft.yaml
+++ b/nemo_skills/training/data_preparation_utils/config/code_sft.yaml
--- a/nemo_skills/training/data_preparation_utils/config/math_rl.yaml
+++ b/nemo_skills/training/data_preparation_utils/config/math_rl.yaml
--- a/nemo_skills/training/data_preparation_utils/config/math_sft.yaml
+++ b/nemo_skills/training/data_preparation_utils/config/math_sft.yaml
--- a/nemo_skills/training/data_preparation_utils/filters.py
+++ b/nemo_skills/training/data_preparation_utils/filters.py
--- a/nemo_skills/training/data_preparation_utils/merge_processor.py
+++ b/nemo_skills/training/data_preparation_utils/merge_processor.py
--- a/nemo_skills/training/data_preparation_utils/preprocessing.py
+++ b/nemo_skills/training/data_preparation_utils/preprocessing.py
--- a/nemo_skills/training/dpo_config.yaml
+++ b/nemo_skills/training/dpo_config.yaml
--- a/nemo_skills/training/gpt_sft_dataset.py
+++ b/nemo_skills/training/gpt_sft_dataset.py
--- a/nemo_skills/training/grpo_config.yaml
+++ b/nemo_skills/training/grpo_config.yaml
--- a/nemo_skills/training/merge_packed_data.py
+++ b/nemo_skills/training/merge_packed_data.py
--- a/nemo_skills/training/nemo_rl/__init__.py
+++ b/nemo_skills/training/nemo_rl/__init__.py
--- a/nemo_skills/training/nemo_rl/configs/grpo.yaml
+++ b/nemo_skills/training/nemo_rl/configs/grpo.yaml
--- a/nemo_skills/training/nemo_rl/configs/sft.yaml
+++ b/nemo_skills/training/nemo_rl/configs/sft.yaml
--- a/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
+++ b/nemo_skills/training/nemo_rl/convert_dcp_to_hf.py
--- a/nemo_skills/training/nemo_rl/environments/__init__.py
+++ b/nemo_skills/training/nemo_rl/environments/__init__.py
--- a/nemo_skills/training/nemo_rl/environments/math_environment.py
+++ b/nemo_skills/training/nemo_rl/environments/math_environment.py
--- a/nemo_skills/training/nemo_rl/prompts/cot.txt
+++ b/nemo_skills/training/nemo_rl/prompts/cot.txt
--- a/nemo_skills/training/nemo_rl/prompts/math.txt
+++ b/nemo_skills/training/nemo_rl/prompts/math.txt
--- a/nemo_skills/training/nemo_rl/start_grpo.py
+++ b/nemo_skills/training/nemo_rl/start_grpo.py
--- a/nemo_skills/training/nemo_rl/start_sft.py
+++ b/nemo_skills/training/nemo_rl/start_sft.py
--- a/nemo_skills/training/openrlhf/__init__.py
+++ b/nemo_skills/training/openrlhf/__init__.py
--- a/nemo_skills/training/openrlhf/math_reward.py
+++ b/nemo_skills/training/openrlhf/math_reward.py
--- a/nemo_skills/training/pack_config.yaml
+++ b/nemo_skills/training/pack_config.yaml
--- a/nemo_skills/training/prepare_data.py
+++ b/nemo_skills/training/prepare_data.py
--- a/nemo_skills/training/prepare_packed_ft_dataset.py
+++ b/nemo_skills/training/prepare_packed_ft_dataset.py
--- a/nemo_skills/training/rm_config.yaml
+++ b/nemo_skills/training/rm_config.yaml
--- a/nemo_skills/training/sft_config.yaml
+++ b/nemo_skills/training/sft_config.yaml
--- a/nemo_skills/training/start_dpo.py
+++ b/nemo_skills/training/start_dpo.py
--- a/nemo_skills/training/start_grpo.py
+++ b/nemo_skills/training/start_grpo.py
--- a/nemo_skills/training/start_rm.py
+++ b/nemo_skills/training/start_rm.py
--- a/nemo_skills/training/start_sft.py
+++ b/nemo_skills/training/start_sft.py
--- a/nemo_skills/training/train_redrafter.py
+++ b/nemo_skills/training/train_redrafter.py
--- a/nemo_skills/training/verl/__init__.py
+++ b/nemo_skills/training/verl/__init__.py
--- a/nemo_skills/training/verl/prepare_data.py
+++ b/nemo_skills/training/verl/prepare_data.py
--- a/nemo_skills/utils.py
+++ b/nemo_skills/utils.py
--- a/nemo_skills/version.py
+++ b/nemo_skills/version.py
--- a/pyproject.toml
+++ b/pyproject.toml
--- a/recipes/README.md
+++ b/recipes/README.md
--- a/recipes/opencodereasoning/configs/solution_sdg/demo.yaml
+++ b/recipes/opencodereasoning/configs/solution_sdg/demo.yaml
--- a/recipes/opencodereasoning/configs/solution_sdg/r1.yaml
+++ b/recipes/opencodereasoning/configs/solution_sdg/r1.yaml
--- a/recipes/opencodereasoning/pipeline/prepare_questions.py
+++ b/recipes/opencodereasoning/pipeline/prepare_questions.py
--- a/recipes/opencodereasoning/pipeline/prepare_solutions.py
+++ b/recipes/opencodereasoning/pipeline/prepare_solutions.py
--- a/recipes/opencodereasoning/prompts/generate_cpp_soln.yaml
+++ b/recipes/opencodereasoning/prompts/generate_cpp_soln.yaml
--- a/recipes/opencodereasoning/prompts/generate_python_soln.yaml
+++ b/recipes/opencodereasoning/prompts/generate_python_soln.yaml
--- a/recipes/opencodereasoning/scripts/filter_questions.py
+++ b/recipes/opencodereasoning/scripts/filter_questions.py
--- a/recipes/opencodereasoning/scripts/functional_helpers.py
+++ b/recipes/opencodereasoning/scripts/functional_helpers.py
--- a/recipes/opencodereasoning/scripts/output_processing.py
+++ b/recipes/opencodereasoning/scripts/output_processing.py
--- a/recipes/opencodereasoning/scripts/prepare_questions.py
+++ b/recipes/opencodereasoning/scripts/prepare_questions.py
--- a/recipes/openmathreasoning.pdf
+++ b/recipes/openmathreasoning.pdf
--- a/recipes/openmathreasoning/configs/genselect_sdg/qwq.yaml
+++ b/recipes/openmathreasoning/configs/genselect_sdg/qwq.yaml
--- a/recipes/openmathreasoning/configs/problem_sdg/demo.yaml
+++ b/recipes/openmathreasoning/configs/problem_sdg/demo.yaml
--- a/recipes/openmathreasoning/configs/problem_sdg/example-data.txt
+++ b/recipes/openmathreasoning/configs/problem_sdg/example-data.txt
--- a/recipes/openmathreasoning/configs/problem_sdg/qwen-instruct.yaml
+++ b/recipes/openmathreasoning/configs/problem_sdg/qwen-instruct.yaml
--- a/recipes/openmathreasoning/configs/solution_sdg/demo.yaml
+++ b/recipes/openmathreasoning/configs/solution_sdg/demo.yaml
--- a/recipes/openmathreasoning/configs/solution_sdg/qwq.yaml
+++ b/recipes/openmathreasoning/configs/solution_sdg/qwq.yaml
--- a/recipes/openmathreasoning/configs/solution_sdg/r1.yaml
+++ b/recipes/openmathreasoning/configs/solution_sdg/r1.yaml
--- a/recipes/openmathreasoning/configs/solution_sdg/tir-limo.yaml
+++ b/recipes/openmathreasoning/configs/solution_sdg/tir-limo.yaml
--- a/recipes/openmathreasoning/configs/solution_sdg/tir-openmath.yaml
+++ b/recipes/openmathreasoning/configs/solution_sdg/tir-openmath.yaml
--- a/recipes/openmathreasoning/pipeline/genselect_generation.py
+++ b/recipes/openmathreasoning/pipeline/genselect_generation.py
--- a/recipes/openmathreasoning/pipeline/problem_generation.py
+++ b/recipes/openmathreasoning/pipeline/problem_generation.py
--- a/recipes/openmathreasoning/pipeline/solution_generation.py
+++ b/recipes/openmathreasoning/pipeline/solution_generation.py
--- a/recipes/openmathreasoning/prompts/classify-if-binary.yaml
+++ b/recipes/openmathreasoning/prompts/classify-if-binary.yaml
--- a/recipes/openmathreasoning/prompts/classify-if-invalid.yaml
+++ b/recipes/openmathreasoning/prompts/classify-if-invalid.yaml
--- a/recipes/openmathreasoning/prompts/classify-if-mcq.yaml
+++ b/recipes/openmathreasoning/prompts/classify-if-mcq.yaml
--- a/recipes/openmathreasoning/prompts/classify-if-proof.yaml
+++ b/recipes/openmathreasoning/prompts/classify-if-proof.yaml
--- a/recipes/openmathreasoning/prompts/classify-tir-novelty.yaml
+++ b/recipes/openmathreasoning/prompts/classify-tir-novelty.yaml
--- a/recipes/openmathreasoning/prompts/classify-tir-significance.yaml
+++ b/recipes/openmathreasoning/prompts/classify-tir-significance.yaml
--- a/recipes/openmathreasoning/prompts/convert-proofs.yaml
+++ b/recipes/openmathreasoning/prompts/convert-proofs.yaml
--- a/recipes/openmathreasoning/prompts/extract-answers.yaml
+++ b/recipes/openmathreasoning/prompts/extract-answers.yaml
--- a/recipes/openmathreasoning/prompts/extract-problems.yaml
+++ b/recipes/openmathreasoning/prompts/extract-problems.yaml
--- a/recipes/openmathreasoning/prompts/math-tir-detailed.yaml
+++ b/recipes/openmathreasoning/prompts/math-tir-detailed.yaml
--- a/recipes/openmathreasoning/prompts/summarize-genselect.yaml
+++ b/recipes/openmathreasoning/prompts/summarize-genselect.yaml
--- a/recipes/openmathreasoning/prompts/summarize-solution.yaml
+++ b/recipes/openmathreasoning/prompts/summarize-solution.yaml
--- a/recipes/openmathreasoning/scripts/extract_python_fragments.py
+++ b/recipes/openmathreasoning/scripts/extract_python_fragments.py
--- a/recipes/openmathreasoning/scripts/filter_novelty_significance.py
+++ b/recipes/openmathreasoning/scripts/filter_novelty_significance.py
--- a/recipes/openmathreasoning/scripts/genselect/extract_judgment.py
+++ b/recipes/openmathreasoning/scripts/genselect/extract_judgment.py
--- a/recipes/openmathreasoning/scripts/genselect/merge_new_summary.py
+++ b/recipes/openmathreasoning/scripts/genselect/merge_new_summary.py
--- a/recipes/openmathreasoning/scripts/genselect/prepare_labeling_data.py
+++ b/recipes/openmathreasoning/scripts/genselect/prepare_labeling_data.py
--- a/recipes/openmathreasoning/scripts/genselect/utils.py
+++ b/recipes/openmathreasoning/scripts/genselect/utils.py
--- a/recipes/openmathreasoning/scripts/merge_new_summary.py
+++ b/recipes/openmathreasoning/scripts/merge_new_summary.py
--- a/recipes/openmathreasoning/scripts/postprocess_answer_extraction.py
+++ b/recipes/openmathreasoning/scripts/postprocess_answer_extraction.py
--- a/recipes/openmathreasoning/scripts/postprocess_classification.py
+++ b/recipes/openmathreasoning/scripts/postprocess_classification.py
--- a/recipes/openmathreasoning/scripts/postprocess_problem_extraction.py
+++ b/recipes/openmathreasoning/scripts/postprocess_problem_extraction.py
--- a/recipes/openmathreasoning/scripts/postprocess_proof_conversion.py
+++ b/recipes/openmathreasoning/scripts/postprocess_proof_conversion.py
--- a/recipes/openmathreasoning/scripts/postprocess_tir_generations.py
+++ b/recipes/openmathreasoning/scripts/postprocess_tir_generations.py
--- a/recipes/openmathreasoning/scripts/prepare_raw_data.py
+++ b/recipes/openmathreasoning/scripts/prepare_raw_data.py
--- a/recipes/openmathreasoning/scripts/simplified_recipe.py
+++ b/recipes/openmathreasoning/scripts/simplified_recipe.py
--- a/recipes/openreasoning/eval.py
+++ b/recipes/openreasoning/eval.py
--- a/recipes/openreasoning/prompts/science_question_augmentation_prompt.yaml
+++ b/recipes/openreasoning/prompts/science_question_augmentation_prompt.yaml
--- a/recipes/openreasoning/prompts/science_question_generation_prompt.yaml
+++ b/recipes/openreasoning/prompts/science_question_generation_prompt.yaml
--- a/recipes/openreasoning/scripts/use_majority_if_no_answer.py
+++ b/recipes/openreasoning/scripts/use_majority_if_no_answer.py
--- a/requirements/code_execution.txt
+++ b/requirements/code_execution.txt
--- a/requirements/common-tests.txt
+++ b/requirements/common-tests.txt
--- a/requirements/docs.txt
+++ b/requirements/docs.txt
--- a/requirements/main.txt
+++ b/requirements/main.txt
--- a/tests/__init__.py
+++ b/tests/__init__.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
--- a/tests/data/code-output.test
+++ b/tests/data/code-output.test
--- a/tests/data/contamination-example.test
+++ b/tests/data/contamination-example.test
--- a/tests/data/eval_outputs/eval-results/answer-judge/output-rs0.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/answer-judge/output-rs0.jsonl-test
--- a/tests/data/eval_outputs/eval-results/answer-judge/output-rs1.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/answer-judge/output-rs1.jsonl-test
--- a/tests/data/eval_outputs/eval-results/answer-judge/output-rs2.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/answer-judge/output-rs2.jsonl-test
--- a/tests/data/eval_outputs/eval-results/answer-judge/output-rs3.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/answer-judge/output-rs3.jsonl-test
--- a/tests/data/eval_outputs/eval-results/arena-hard/output.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/arena-hard/output.jsonl-test
--- a/tests/data/eval_outputs/eval-results/gpqa/output-rs0.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/gpqa/output-rs0.jsonl-test
--- a/tests/data/eval_outputs/eval-results/gpqa/output-rs1.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/gpqa/output-rs1.jsonl-test
--- a/tests/data/eval_outputs/eval-results/gpqa/output-rs2.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/gpqa/output-rs2.jsonl-test
--- a/tests/data/eval_outputs/eval-results/gpqa/output-rs3.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/gpqa/output-rs3.jsonl-test
--- a/tests/data/eval_outputs/eval-results/human-eval/output-rs0.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/human-eval/output-rs0.jsonl-test
--- a/tests/data/eval_outputs/eval-results/human-eval/output-rs1.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/human-eval/output-rs1.jsonl-test
--- a/tests/data/eval_outputs/eval-results/ifeval/output-rs0.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/ifeval/output-rs0.jsonl-test
--- a/tests/data/eval_outputs/eval-results/ifeval/output-rs1.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/ifeval/output-rs1.jsonl-test
--- a/tests/data/eval_outputs/eval-results/ifeval/output-rs2.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/ifeval/output-rs2.jsonl-test
--- a/tests/data/eval_outputs/eval-results/math/output-rs0.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/math/output-rs0.jsonl-test
--- a/tests/data/eval_outputs/eval-results/math/output-rs1.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/math/output-rs1.jsonl-test
--- a/tests/data/eval_outputs/eval-results/math/output-rs2.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/math/output-rs2.jsonl-test
--- a/tests/data/eval_outputs/eval-results/metrics-ms8192.json-test
+++ b/tests/data/eval_outputs/eval-results/metrics-ms8192.json-test
--- a/tests/data/eval_outputs/eval-results/metrics.json-test
+++ b/tests/data/eval_outputs/eval-results/metrics.json-test
--- a/tests/data/eval_outputs/eval-results/minif2f/output-rs0.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/minif2f/output-rs0.jsonl-test
--- a/tests/data/eval_outputs/eval-results/minif2f/output-rs1.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/minif2f/output-rs1.jsonl-test
--- a/tests/data/eval_outputs/eval-results/minif2f/output-rs2.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/minif2f/output-rs2.jsonl-test
--- a/tests/data/eval_outputs/eval-results/minif2f/output-rs3.jsonl-test
+++ b/tests/data/eval_outputs/eval-results/minif2f/output-rs3.jsonl-test
--- a/tests/data/eval_outputs/summarize_results_output-ms8192.txt
+++ b/tests/data/eval_outputs/summarize_results_output-ms8192.txt
--- a/tests/data/eval_outputs/summarize_results_output.txt
+++ b/tests/data/eval_outputs/summarize_results_output.txt
--- a/tests/data/openai-input-dict.test
+++ b/tests/data/openai-input-dict.test
--- a/tests/data/openai-input-list.test
+++ b/tests/data/openai-input-list.test
--- a/tests/data/openmathinstruct2.test
+++ b/tests/data/openmathinstruct2.test
--- a/tests/data/output-rs0.test
+++ b/tests/data/output-rs0.test
--- a/tests/data/output-rs1.test
+++ b/tests/data/output-rs1.test
--- a/tests/data/output-rs2.test
+++ b/tests/data/output-rs2.test
--- a/tests/data/small-grpo-data.test
+++ b/tests/data/small-grpo-data.test
--- a/tests/data/small-sft-data.test
+++ b/tests/data/small-sft-data.test
--- a/tests/gpu-tests/__init__.py
+++ b/tests/gpu-tests/__init__.py
--- a/tests/gpu-tests/make_tiny_llm.py
+++ b/tests/gpu-tests/make_tiny_llm.py
--- a/tests/gpu-tests/run_llama.sh
+++ b/tests/gpu-tests/run_llama.sh
--- a/tests/gpu-tests/run_qwen.sh
+++ b/tests/gpu-tests/run_qwen.sh
--- a/tests/gpu-tests/test-local.yaml
+++ b/tests/gpu-tests/test-local.yaml
--- a/tests/gpu-tests/test_contamination.py
+++ b/tests/gpu-tests/test_contamination.py
--- a/tests/gpu-tests/test_convert.py
+++ b/tests/gpu-tests/test_convert.py
--- a/tests/gpu-tests/test_eval.py
+++ b/tests/gpu-tests/test_eval.py
--- a/tests/gpu-tests/test_generate.py
+++ b/tests/gpu-tests/test_generate.py
--- a/tests/gpu-tests/test_judge.py
+++ b/tests/gpu-tests/test_judge.py
--- a/tests/gpu-tests/test_run_cmd_llm_infer.py
+++ b/tests/gpu-tests/test_run_cmd_llm_infer.py
--- a/tests/gpu-tests/test_train.py
+++ b/tests/gpu-tests/test_train.py
--- a/tests/scripts/run_cmd_llm_infer_check.py
+++ b/tests/scripts/run_cmd_llm_infer_check.py
--- a/tests/test_code_execution.py
+++ b/tests/test_code_execution.py
--- a/tests/test_configs.py
+++ b/tests/test_configs.py
--- a/tests/test_data_preparation.py
+++ b/tests/test_data_preparation.py
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
--- a/tests/test_default_args.py
+++ b/tests/test_default_args.py
--- a/tests/test_generation.py
+++ b/tests/test_generation.py
--- a/tests/test_math_equal.py
+++ b/tests/test_math_equal.py
--- a/tests/test_metrics.py
+++ b/tests/test_metrics.py
--- a/tests/test_prompts.py
+++ b/tests/test_prompts.py