Commit e4d72d72 by Shi wenxuan

Initial commit

parents
# Builds the MkDocs site and publishes it to GitHub Pages.
name: Build docs

on:
  push:
    branches:
      - main
  # Manual runs from the Actions tab
  workflow_dispatch:

# Token scopes requested by this workflow
permissions:
  contents: read
  pages: write
  id-token: write

# Only one Pages deployment at a time: runs queued between the in-progress
# run and the newest one are skipped, but an in-progress run is never
# cancelled so that a production deployment can finish.
concurrency:
  group: pages
  cancel-in-progress: false

jobs:
  # Build docs and deploy to the website
  deploy:
    environment:
      name: github-pages
      url: ${{ steps.deployment.outputs.page_url }}
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Configure Git Credentials
        run: |
          git config user.name github-actions[bot]
          git config user.email 41898282+github-actions[bot]@users.noreply.github.com

      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      # The cache key is derived from `date --utc '+%V'`, so it changes
      # whenever that value changes; older caches remain usable through
      # restore-keys.
      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV

      - uses: actions/cache@v4
        with:
          key: mkdocs-material-${{ env.cache_id }}
          path: .cache
          restore-keys: |
            mkdocs-material-

      - run: pip install -r requirements/docs.txt

      - run: mkdocs build

      - name: Upload artifact
        uses: actions/upload-pages-artifact@v3
        with:
          path: site

      - name: Deploy to GitHub Pages
        id: deployment
        uses: actions/deploy-pages@v4
\ No newline at end of file
# Runs the GPU integration suites on the self-hosted GPU runner.
name: Integration tests

on:
  pull_request:
    branches: ["main"]
    types: [opened, synchronize, reopened, labeled]
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

permissions:
  contents: read

# One run per PR (or per ref for manual runs); a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  gpu-tests-llama:
    runs-on: self-hosted-nemo-gpus-1
    # Run when the 'run GPU tests' label is applied to the PR, or when the
    # workflow is started manually. Manual runs carry no label payload, so
    # without the event_name check they would always be skipped.
    if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'run GPU tests' }}
    steps:
      - name: Cleanup old containers
        run: |
          docker system prune --all --filter "until=360h" --force

      # Check out into a per-run directory so runs do not share a workspace.
      - uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install dependencies
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          cd ${{ github.run_id }}
          python -m pip install --upgrade pip
          pip uninstall -y nemo-skills nemo_run
          pip install -e .
          pip install -r requirements/common-tests.txt
          ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24

      - name: Run GPU tests
        timeout-minutes: 180
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          cd ${{ github.run_id }}
          nvidia-smi
          set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
          ./tests/gpu-tests/run_llama.sh

      # Always remove test artifacts and stop containers, even after a failure.
      - name: Cleanup
        if: always()
        run: |
          docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.6.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
          docker ps -a -q | xargs -r docker stop

  gpu-tests-qwen:
    runs-on: self-hosted-nemo-gpus-1
    # Same gating as gpu-tests-llama: label on the PR or a manual run.
    if: ${{ github.event_name == 'workflow_dispatch' || github.event.label.name == 'run GPU tests' }}
    steps:
      - name: Cleanup old containers
        run: |
          docker system prune --all --filter "until=360h" --force

      # Check out into a per-run directory so runs do not share a workspace.
      - uses: actions/checkout@v4
        with:
          path: ${{ github.run_id }}

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install dependencies
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          cd ${{ github.run_id }}
          python -m pip install --upgrade pip
          pip uninstall -y nemo-skills nemo_run
          pip install -e .
          pip install -r requirements/common-tests.txt
          ns prepare_data gsm8k human-eval mbpp algebra222 mmlu ifeval math-500 amc23 aime24

      - name: Run GPU tests
        timeout-minutes: 180
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          cd ${{ github.run_id }}
          nvidia-smi
          set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
          ./tests/gpu-tests/run_qwen.sh

      # Always remove test artifacts and stop containers, even after a failure.
      - name: Cleanup
        if: always()
        run: |
          docker run --rm -v /tmp:/tmp -v /home:/home igitman/nemo-skills:0.6.1 bash -c 'rm -rf /tmp/nemo-skills-tests /home/azureuser/.nemo_run/'
          docker ps -a -q | xargs -r docker stop
# Runs the non-GPU test suite on a GitHub-hosted runner.
name: CPU tests

on:
  pull_request:
    branches: ["main"]
  # Allows you to run this workflow manually from the Actions tab
  workflow_dispatch:

permissions:
  contents: read

# One run per PR (or per ref for manual runs); a newer run cancels the older one.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -e .
          pip install -r requirements/common-tests.txt

      # Starts the sandbox container in the background, waits a fixed 120s
      # for it to come up, then exports its container IP for the tests.
      # NOTE(review): the image pulled (igitman/nemo-skills) differs from the
      # image run (igitman/nemo-skills-sandbox) - confirm both are needed.
      - name: Run all tests
        env:
          NVIDIA_API_KEY: ${{ secrets.NVIDIA_API_KEY }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          docker pull igitman/nemo-skills:0.6.1
          docker run --rm --name=local-sandbox igitman/nemo-skills-sandbox:0.6.1 &
          sleep 120
          export NEMO_SKILLS_SANDBOX_HOST=$(docker inspect -f '{{range .NetworkSettings.Networks}}{{.IPAddress}}{{end}}' local-sandbox)
          set -o pipefail # this will make sure next line returns non-0 exit code if tests fail
          ns prepare_data gsm8k math-500
          python -m pytest tests/ -m "not gpu" --junitxml=pytest.xml --cov-report=term-missing:skip-covered --cov=nemo_skills --cov=pipeline --durations=30 -rs -s -vvv
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Ignore patterns. Order matters only for the negated (`!`) rule further down,
# which must come after the pattern it re-includes from.
*.json
*.tar.gz
*.tar
*.npy
*.info
*.jsonl
*.csv
nemo_experiments
wandb
build
.hypothesis
*.zip
*.egg-info
*.xml
*.DS_Store
.coverage
.venv
*.lock
__pycache__
.ipynb_checkpoints
# Ignore everything directly inside cluster_configs/ except the files matching
# example-*.yaml, which the negated pattern re-includes.
cluster_configs/*
!cluster_configs/example-*.yaml
# Trailing slash: only directories one level below these dataset folders are ignored.
nemo_skills/dataset/ruler/*/
nemo_skills/dataset/bfcl_v3/*/
# NOTE(review): `.idea/` already ignores the whole directory, so `.idea/*` looks redundant.
.idea/
.idea/*
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# pre-commit configuration: generic file checks, import sorting and formatting.
default_language_version:
  python: python3

# Settings read by the pre-commit.ci service
ci:
  autofix_prs: true
  autoupdate_commit_msg: "[pre-commit.ci] pre-commit suggestions"
  autoupdate_schedule: quarterly

repos:
  # General-purpose repository hygiene hooks
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: check-yaml
        # mkdocs.yml is excluded from YAML validation
        exclude: ^mkdocs\.yml$
      - id: check-case-conflict
      - id: detect-private-key
      - id: check-added-large-files
        args:
          - --maxkb=1000
      - id: requirements-txt-fixer

  # Import ordering, configured with the black profile
  - repo: https://github.com/PyCQA/isort
    rev: 5.13.2
    hooks:
      - id: isort
        name: Format imports
        exclude: docs/
        args:
          - --profile
          - black

  # Code formatting
  - repo: https://github.com/psf/black
    rev: 24.10.0
    hooks:
      - id: black
        name: Format code
        exclude: docs/source-app
# Contributing To NeMo-Skills
Thanks for your interest in contributing to NeMo-Skills!
## Running Tests
TBD
## Code Quality
- Follow the existing code style and conventions
- Write tests for new features
- Update documentation to reflect your changes
- Ensure all tests pass before submitting a PR
- Do not add arbitrary defaults for configs; be as explicit as possible.
## Signing Your Work
* We require that all contributors "sign-off" on their commits. This certifies that the contribution is your original work, or you have rights to submit it under the same license, or a compatible license.
* Any contribution which contains commits that are not Signed-Off will not be accepted.
* To sign off on a commit you simply use the `--signoff` (or `-s`) option when committing your changes:
```bash
$ git commit -s -m "Add cool feature."
```
This will append the following to your commit message:
```
Signed-off-by: Your Name <your@email.com>
```
* Full text of the DCO:
```
Developer Certificate of Origin
Version 1.1
Copyright (C) 2004, 2006 The Linux Foundation and its contributors.
Everyone is permitted to copy and distribute verbatim copies of this
license document, but changing it is not allowed.
Developer's Certificate of Origin 1.1
By making a contribution to this project, I certify that:
(a) The contribution was created in whole or in part by me and I
have the right to submit it under the open source license
indicated in the file; or
(b) The contribution is based upon previous work that, to the best
of my knowledge, is covered under an appropriate open source
license and I have the right under that license to submit that
work with modifications, whether created in whole or in part
by me, under the same open source license (unless I am
permitted to submit under a different license), as indicated
in the file; or
(c) The contribution was provided directly to me by some other
person who certified (a), (b) or (c) and I have not modified
it.
(d) I understand and agree that this project and the contribution
are public and that a record of the contribution (including all
personal information I submit with it, including my sign-off) is
maintained indefinitely and may be redistributed consistent with
this project or the open source license(s) involved.
```
\ No newline at end of file
This diff is collapsed. Click to expand it.
# Ship every .yaml and .txt file found anywhere under nemo_skills/ with the package.
recursive-include nemo_skills *.yaml
recursive-include nemo_skills *.txt
\ No newline at end of file
# NeMo Skills
NeMo-Skills is a collection of pipelines to improve "skills" of large language models (LLMs). We support everything needed for LLM development, from synthetic data generation, to model training, to evaluation on a wide range of benchmarks. Start developing on a local workstation and move to a large-scale Slurm cluster with just a one-line change.
Here are some of the features we support:
- [Flexible LLM inference](https://nvidia.github.io/NeMo-Skills/pipelines/generation/):
- Seamlessly switch between API providers, local server and large-scale slurm jobs for LLM inference.
- Host models (on 1 or many nodes) with [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [vLLM](https://github.com/vllm-project/vllm), [sglang](https://github.com/sgl-project/sglang) or [Megatron](https://github.com/NVIDIA/Megatron-LM).
- Scale SDG jobs from 1 GPU on a local machine all the way to tens of thousands of GPUs on a slurm cluster.
- [Model evaluation](https://nvidia.github.io/NeMo-Skills/pipelines/evaluation):
- Evaluate your models on many popular benchmarks.
- Math problem solving: hmmt_feb25, brumo25, aime24, aime25, omni-math (and many more)
- Formal proofs in Lean: minif2f, proofnet
- Coding skills: scicode, livecodebench, human-eval, mbpp
- Chat/instruction following: ifbench, ifeval, arena-hard
- General knowledge: mmlu, mmlu-pro, gpqa
- Long context: ruler
- Easily parallelize each evaluation across many slurm jobs, self-host LLM judges, bring your own prompts or change benchmark configuration in any other way.
- [Model training](https://nvidia.github.io/NeMo-Skills/pipelines/training): Train models using [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/), [NeMo-RL](https://github.com/NVIDIA/NeMo-RL/) or [verl](https://github.com/volcengine/verl).
## News
* [07/30/2025]: The datasets used to train OpenReasoning models are released! Math and code are available as part of [Nemotron-Post-Training-Dataset-v1](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1) and science is available in
[OpenScienceReasoning-2](https://huggingface.co/datasets/nvidia/OpenScienceReasoning-2).
See our [documentation](https://nvidia.github.io/NeMo-Skills/releases/openreasoning/training) for more details.
* [07/18/2025]: We released [OpenReasoning](https://nvidia.github.io/NeMo-Skills/releases/openreasoning/) models! SOTA scores on math, coding and science benchmarks.
![Evaluation Results with pass@1](docs/releases/openreasoning/pass-1.png)
![Evaluation Results with GenSelect](docs/releases/openreasoning/genselect.png)
* [04/23/2025]: We released [OpenMathReasoning](https://nvidia.github.io/NeMo-Skills/openmathreasoning1) dataset and models!
* OpenMathReasoning dataset has 306K unique mathematical problems sourced from [AoPS forums](https://artofproblemsolving.com/community) with:
* 3.2M long chain-of-thought (CoT) solutions
* 1.7M long tool-integrated reasoning (TIR) solutions
* 566K samples that select the most promising solution out of many candidates (GenSelect)
* OpenMath-Nemotron models are SOTA open-weight models on math reasoning benchmarks at the time of release!
* [10/03/2024]: We released [OpenMathInstruct-2](https://nvidia.github.io/NeMo-Skills/openmathinstruct2) dataset and models!
* OpenMathInstruct-2 is a math instruction tuning dataset with 14M problem-solution pairs generated using the Llama3.1-405B-Instruct model.
* OpenMath-2-Llama models show significant improvements compared to their Llama3.1-Instruct counterparts.
## Getting started
To get started, follow these [steps](https://nvidia.github.io/NeMo-Skills/basics),
browse available [pipelines](https://nvidia.github.io/NeMo-Skills/pipelines) or run `ns --help` to see all available
commands and their options.
You can find more examples of how to use NeMo-Skills in the [tutorials](https://nvidia.github.io/NeMo-Skills/tutorials) page.
We've built and released many popular models and datasets using NeMo-Skills. See all of them in the [Papers & Releases](https://nvidia.github.io/NeMo-Skills/releases/) documentation.
You can find the full documentation [here](https://nvidia.github.io/NeMo-Skills/).
## Contributing
We welcome contributions to NeMo-Skills! Please see our [Contributing Guidelines](./CONTRIBUTING.md) for more information on how to get involved.
Disclaimer: This project is strictly for research purposes, and not an official product from NVIDIA.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Example cluster config for running everything on the local machine.
executor: local

# Container image used for each component
containers:
  trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0
  vllm: igitman/nemo-skills-vllm:0.7.0
  sglang: igitman/nemo-skills-sglang:0.7.0
  nemo: igitman/nemo-skills-nemo:0.7.0
  megatron: igitman/nemo-skills-megatron:0.7.0
  sandbox: igitman/nemo-skills-sandbox:0.7.0
  nemo-skills: igitman/nemo-skills:0.7.0
  verl: igitman/nemo-skills-verl:0.7.0
  nemo-rl: igitman/nemo-skills-nemo-rl:0.7.0

# Add the mounts required for your models/data below.
# The code is mounted automatically inside /nemo_run/code, but note that only
# what is tracked by git (plus jsonl files inside nemo_skills/dataset) is packaged.
# mounts:
#   # you can define as many as you need, e.g.
#   - /mnt/datadrive/models:/models
#   - /mnt/datadrive/data:/data
#   - /home/<username>/workspace:/workspace
#   # you can also override container libraries by mounting directly over them.
#   # E.g. to override NeMo-Aligner do
#   - <...>/NeMo-Aligner:/opt/NeMo-Aligner
\ No newline at end of file
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Example cluster config for submitting jobs to a Slurm cluster.
executor: slurm

# Container image used for each component
containers:
  trtllm: nvcr.io/nvidia/tensorrt-llm/release:0.21.0
  vllm: igitman/nemo-skills-vllm:0.7.0
  sglang: igitman/nemo-skills-sglang:0.7.0
  nemo: igitman/nemo-skills-nemo:0.7.0
  megatron: igitman/nemo-skills-megatron:0.7.0
  sandbox: igitman/nemo-skills-sandbox:0.7.0
  nemo-skills: igitman/nemo-skills:0.7.0
  verl: igitman/nemo-skills-verl:0.7.0
  nemo-rl: igitman/nemo-skills-nemo-rl:0.7.0

job_name_prefix: "nemo_skills:"

# Define this section for ssh access:
# ssh_tunnel:
#   host: <slurm host>
#   user: <username>
#   job_dir: <some location on slurm cluster to keep job metadata, uploaded code and generated sbatch files>
#   identity: <can specify ssh key to avoid entering password>

# If you're running directly from the cluster, you only need to define job_dir
# and shouldn't use ssh_tunnel:
# job_dir: <some location on slurm cluster to keep job metadata, uploaded code and generated sbatch files>

# Define your account/partition here:
# account: <slurm account>
# partition: <slurm partition>
# cpu_partition: <if cluster has a dedicated cpu partition, you can define it here>

# Add the mounts required for your models/data below.
# The code is mounted automatically inside /nemo_run/code, but note that only
# what is tracked by git (plus jsonl files inside nemo_skills/dataset) is packaged.
# mounts:
#   - <slurm location for your data/models>:<where to mount in a container>
#   # e.g.
#   - <path on slurm>/trt_models:/trt_models
#   - <path on slurm>/data:/data
#   # you can also override container libraries by mounting directly over them.
#   # E.g. to override NeMo-Aligner do
#   - <path on slurm>/NeMo-Aligner:/opt/NeMo-Aligner

# Use this section to set timeouts for different partitions.
# The value is used as a slurm parameter and also to signal the SFT job to
# finish before the timeout, leaving time to save the last checkpoint.
# timeouts:
#   partition_name1: 06:00:00
#   partition_name2: 01:30:00
# Dataset Explorer Demo
1. Download data TBD
2. Retrieve similar questions from OpenMathInstruct-2. Do this for every benchmark you want to compare against.
The commands below assume you're running from this folder.
```
python -m nemo_skills.inference.retrieve_similar \
++retrieve_from=./data.jsonl \
++compare_to="../nemo_skills/dataset/<benchmark>/test.jsonl" \
++output_file=./similar-retrieved-openmath2/<benchmark>.jsonl \
++top_k=5
```
3. Let's do the same for the original MATH training set to get a sense of whether OpenMathInstruct-2 is in the same
distribution or not.
```
python -m nemo_skills.inference.retrieve_similar \
++retrieve_from=../nemo_skills/dataset/math/train.jsonl \
++compare_to="../nemo_skills/dataset/<benchmark>/test.jsonl" \
++output_file=./similar-retrieved-math-train/<benchmark>.jsonl \
++top_k=5
```
4. Start the Gradio demo.
```
python visualize_similar.py
```
\ No newline at end of file
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import random
import re
from functools import lru_cache
import gradio as gr
from latex2mathml.converter import convert
from latex2mathml.exceptions import NoAvailableTokensError
@lru_cache(maxsize=1000)
def load_jsonl(file_path):
    """Read a JSON Lines file and return its records as a list.

    The file is always decoded as UTF-8 (instead of the platform default
    encoding) and whitespace-only lines are skipped, so a trailing blank line
    no longer raises ``json.JSONDecodeError``.

    Args:
        file_path: Path of the ``.jsonl`` file; also used as the cache key.

    Returns:
        A list with one decoded JSON value per non-blank line. The same list
        object is returned on every cache hit, so callers should not mutate it.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]
@lru_cache(maxsize=10000)
def render_latex(text):
    """Replace the LaTeX math fragments found in ``text`` with MathML markup.

    Fragments are rewritten in this order:

    1. ``\\begin{align*} ... \\end{align*}`` environments (block display),
    2. bracket-delimited display math ``[...]``, skipping spans that contain
       only digits, ``-``, ``,`` and spaces (e.g. the interval ``[1, 2]``),
    3. ``$...$`` inline math.

    Text outside those fragments is returned unchanged. Results are memoized.

    Args:
        text: The string to process (must be hashable for the cache).

    Returns:
        ``text`` with every recognized fragment replaced by a ``<math>`` element.
    """

    def convert_and_clean(latex):
        """Convert one LaTeX snippet to MathML without the outer ``<math>`` tag.

        Falls back to returning the (pre-processed) LaTeX when the converter
        raises ``NoAvailableTokensError``.
        """
        try:
            # Pre-process nested matrices
            latex = re.sub(r'\\begin{pmatrix}(.*?)\\end{pmatrix}', replace_matrix, latex, flags=re.DOTALL)
            # Handle \displaystyle
            latex = latex.replace('\\displaystyle', '')
            # (The former "nested exponents" substitution rewrote ^{x} to ^{x},
            # i.e. it was a no-op, and has been dropped.)
            # Convert LaTeX to MathML
            mathml = convert(latex)
            return re.sub(r'<math.*?>(.*)</math>', r'\1', mathml)
        except NoAvailableTokensError:
            return latex

    def replace_matrix(match):
        """Render a ``pmatrix`` body as a parenthesized table, one cell per row."""
        rows = match.group(1).split('\\\\')
        mml_rows = ''.join(f'<mtr><mtd>{convert_and_clean(row.strip())}</mtd></mtr>' for row in rows)
        return f'<mrow><mo>(</mo><mtable rowspacing="4pt" columnspacing="1em">{mml_rows}</mtable><mo>)</mo></mrow>'

    def replace_align(match):
        """Render an ``align*`` body as a table with one ``<mtr>`` per row.

        A row without ``&`` becomes a single centered cell. Otherwise the row is
        split on every ``&`` and the cells alternate right/left alignment, which
        yields the original right|left pair for the common two-cell case and no
        longer raises ``ValueError`` when a row has more than one ``&``.
        """
        mml_rows = []
        for row in match.group(1).split('\\\\'):
            cells = row.split('&')
            if len(cells) == 1:
                mml_rows.append(f'<mtr><mtd columnalign="center">{convert_and_clean(row.strip())}</mtd></mtr>')
                continue
            mml_cells = []
            for position, cell in enumerate(cells):
                alignment = 'right' if position % 2 == 0 else 'left'
                mml_cells.append(f'<mtd columnalign="{alignment}">{convert_and_clean(cell.strip())}</mtd>')
            mml_rows.append(f'<mtr>{"".join(mml_cells)}</mtr>')
        return f'<mtable columnspacing="1em" rowspacing="3pt" displaystyle="true">{"".join(mml_rows)}</mtable>'

    # Handle align* environment
    text = re.sub(
        r'\\begin{align\*}(.*?)\\end{align\*}',
        lambda m: f'<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">{replace_align(m)}</math>',
        text,
        flags=re.DOTALL,
    )
    # Handle display math, excluding intervals
    text = re.sub(
        r'\[(?![-\d, ]+\])(.*?)\]',
        lambda m: f'<math xmlns="http://www.w3.org/1998/Math/MathML" display="block">{convert_and_clean(m.group(1))}</math>',
        text,
        flags=re.DOTALL,
    )
    # Handle inline math
    text = re.sub(
        r'\$(.*?)\$',
        lambda m: f'<math xmlns="http://www.w3.org/1998/Math/MathML">{convert_and_clean(m.group(1))}</math>',
        text,
    )
    return text
@lru_cache(maxsize=1000)
def display_entry(index, test_set):
    """Build the HTML comparison view for one test-set problem.

    Args:
        index: Position of the problem in the sorted data for ``test_set``.
        test_set: Test set name without the ``.jsonl`` extension.

    Returns:
        An HTML string showing the test problem followed by two side-by-side
        lists of similar problems (OpenMathInstruct-2 on the left, MATH
        training set on the right), or an error message when ``index`` is
        outside the data range.
    """
    openmath2_entries, math_train_entries = load_test_sets(f"{test_set}.jsonl")
    total = len(openmath2_entries)

    # Check if the index is valid
    if index >= total or index < 0:
        return f"Error: Invalid index. Please enter a number between 0 and {total - 1}."

    openmath2_entry = openmath2_entries[index]
    math_train_entry = math_train_entries[index]

    # gsm8k text is shown verbatim; every other test set goes through render_latex
    if test_set == "gsm8k":
        def fmt(raw):
            return raw
    else:
        fmt = render_latex

    test_problem = fmt(openmath2_entry['problem'])
    similar_openmath2 = [fmt(cand) for cand in openmath2_entry['similar_items']]
    similar_math_train = [fmt(cand) for cand in math_train_entry['similar_items']]

    def list_items(candidates):
        return "".join(f"<li>{cand}</li>" for cand in candidates)

    pieces = [
        f"<h2>Test set problem:</h2><p>{test_problem}</p>",
        "<hr>",
        "<div style='display: flex;'>",
        # left column
        "<div style='flex: 1; padding-right: 10px;'>",
        "<h2>Most similar OpenMathInstruct-2 problems:</h2><ol>",
        list_items(similar_openmath2),
        "</ol></div>",
        # vertical divider
        "<div style='border-left: 1px solid #ccc;'></div>",
        # right column
        "<div style='flex: 1; padding-left: 10px;'>",
        "<h2>Most similar MATH training set problems:</h2><ol>",
        list_items(similar_math_train),
        "</ol></div>",
        "</div>",
    ]
    return "".join(pieces)
def random_entry(data):
    """Return a uniformly random valid index into ``data``.

    Args:
        data: Any sized collection; it must be non-empty, otherwise
            ``random.randint`` raises ``ValueError``.

    Returns:
        An integer in ``[0, len(data) - 1]``.
    """
    last_index = len(data) - 1
    return random.randint(0, last_index)
@lru_cache(maxsize=10)
def load_test_sets(test_set):
    """Load the two retrieval results for ``test_set``, aligned by problem.

    Both files are read with ``load_jsonl`` and ordered by their ``'problem'``
    field. Sorting uses ``sorted()`` so that the lists cached inside
    ``load_jsonl`` are not mutated in place.

    Args:
        test_set: File name (including ``.jsonl``) looked up under
            ``./similar-retrieved-openmath2/`` and ``./similar-retrieved-math-train/``.

    Returns:
        A ``(data_openmath2, data_math_train)`` tuple of equally long lists
        whose entries at the same index share the same ``'problem'``.

    Raises:
        ValueError: If the two files differ in length or their sorted problems
            do not match pairwise.
    """
    file_path_openmath2 = f'./similar-retrieved-openmath2/{test_set}'
    file_path_math_train = f'./similar-retrieved-math-train/{test_set}'

    raw_openmath2 = load_jsonl(file_path_openmath2)
    raw_math_train = load_jsonl(file_path_math_train)

    # Sort both datasets based on the 'problem' field (or use 'id' if available)
    data_openmath2 = sorted(raw_openmath2, key=lambda x: x['problem'])
    data_math_train = sorted(raw_math_train, key=lambda x: x['problem'])

    # Check if the sorted datasets have the same length and matching problems
    if len(data_openmath2) != len(data_math_train):
        raise ValueError(
            f"Datasets have different lengths: OpenMathInstruct-2 ({len(data_openmath2)}) vs MATH training set ({len(data_math_train)})"
        )

    for i, (entry_openmath2, entry_math_train) in enumerate(zip(data_openmath2, data_math_train)):
        if entry_openmath2['problem'] != entry_math_train['problem']:
            raise ValueError(
                f"Mismatch at index {i}: OpenMathInstruct-2 problem doesn't match MATH training set problem"
            )

    return data_openmath2, data_math_train
# Discover the available test sets from the retrieval output directory.
# os.listdir() returns entries in arbitrary order, so sort for a stable
# dropdown order across runs and platforms.
test_sets = sorted(f for f in os.listdir('./similar-retrieved-openmath2') if f.endswith('.jsonl'))
if not test_sets:
    # The UI below indexes test_set_names[0]; fail here with an actionable message.
    raise RuntimeError("No .jsonl files found in ./similar-retrieved-openmath2; run the retrieval step first.")
# Show the MATH test set first when it is present.
if "math.jsonl" in test_sets:
    test_sets.remove("math.jsonl")
    test_sets.insert(0, "math.jsonl")
# Display names are the file names without the .jsonl extension.
test_set_names = [os.path.splitext(f)[0] for f in test_sets]
# Gradio UI: a dropdown selects the test set, a number field or the random
# button selects the problem, and the HTML pane shows the comparison view.
with gr.Blocks() as demo:
    gr.Markdown("# OpenMathInstruct-2 test set contamination explorer")
    gr.Markdown(
        "During construction of OpenMathInstruct-2 we generated many synthetic problems. "
        "We did a very thorough decontamination to remove exact duplicates (including rephrases) with popular benchmarks.<br>"
        "Still our dataset contains many questions that are very similar to test sets. "
        "To make things more transparent we created this demo, that you can use to explore "
        "most similar questions from our data for each of the test set problems.<br>"
        "We also provide closest examples from MATH training set, since it was used as seed data "
        "to create our dataset and in most cases that training set already contains very similar questions to the test sets!<br>"
        "See our full dataset at HuggingFace: [OpenMathInstruct-2](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2)<br>"
        "And read our [paper](https://arxiv.org/abs/2410.01560) to learn more about the decontamination process and how we retrieve similar questions."
    )

    warning_box = gr.Markdown(visible=False)

    with gr.Row():
        test_set_dropdown = gr.Dropdown(choices=test_set_names, label="Select Test Set", value=test_set_names[0])
        index_input = gr.Number(label="Problem Index", value=0, step=1)
        random_button = gr.Button("Random Problem")

    output = gr.HTML()
    current_test_set = gr.State(test_set_names[0])

    def update_test_set(test_set):
        """Reset the view to problem 0 of ``test_set``.

        Returns one value per entry of the ``outputs`` list wired to the
        dropdown below, in the same order.
        """
        data_openmath2, data_math_train = load_test_sets(f"{test_set}.jsonl")
        warning = ""
        warning_visible = False
        if test_set == "omni-math":
            warning = "⚠️ Since Omni-Math benchmarks was released after we finished training of our models, we didn't perform decontamination with it and some of the problems might match exactly!"
            warning_visible = True
        return (
            0,
            display_entry(0, test_set),
            warning,
            gr.update(visible=warning_visible),
            test_set,
            gr.update(maximum=len(data_openmath2) - 1),  # Update the maximum allowed index
        )

    def display_entry_wrapper(index, current_test_set):
        """Clamp ``index`` into the valid range and render that entry."""
        data_openmath2, _ = load_test_sets(f"{current_test_set}.jsonl")
        # The number field may deliver None (e.g. when it is cleared); int(None)
        # would raise TypeError, so fall back to the first problem.
        requested = 0 if index is None else int(index)
        # Ensure the index is within bounds
        index = max(0, min(requested, len(data_openmath2) - 1))
        return display_entry(index, current_test_set)

    def random_entry_wrapper(current_test_set):
        """Pick a random problem index for the currently selected test set."""
        data_openmath2, _ = load_test_sets(f"{current_test_set}.jsonl")
        return random_entry(data_openmath2)

    # NOTE(review): warning_box and index_input each appear twice in `outputs`
    # (value first, then a property update) - confirm the Gradio version in use
    # applies both updates.
    test_set_dropdown.change(
        update_test_set,
        inputs=[test_set_dropdown],
        outputs=[
            index_input,
            output,
            warning_box,
            warning_box,
            current_test_set,
            index_input,
        ],
    )
    index_input.change(display_entry_wrapper, inputs=[index_input, current_test_set], outputs=output)
    random_button.click(random_entry_wrapper, inputs=[current_test_set], outputs=index_input)
    demo.load(display_entry_wrapper, inputs=[index_input, current_test_set], outputs=output)

# Serve on all network interfaces, port 5005.
demo.launch(debug=False, server_name='0.0.0.0', server_port=5005)
# Image with Megatron-LM installed from a pinned commit on top of the NVIDIA PyTorch container.
FROM nvcr.io/nvidia/pytorch:25.04-py3
# Set working directory
WORKDIR /opt
# Install megatron-lm
# The commit is pinned through MEGATRON_COMMIT and installed in editable mode.
ENV MEGATRON_COMMIT=dfc0a3d004391a82d8d8a5a6d991b65eaed0190c
RUN git clone https://github.com/NVIDIA/Megatron-LM && \
cd Megatron-LM && \
git checkout $MEGATRON_COMMIT && \
pip install -e .
# installing libs for hf -> megatron conversion
RUN pip install transformers accelerate
# fix for https://github.com/NVIDIA/NeMo/issues/12836
# there is a global requirements lock that we need to remove..
# The constraint file is replaced by an empty one rather than just deleted.
RUN rm /etc/pip/constraint.txt && touch /etc/pip/constraint.txt
RUN pip install -U "nvidia-modelopt[all]>=0.27"
# Make the cloned Megatron-LM checkout importable.
ENV PYTHONPATH=/opt/Megatron-LM
\ No newline at end of file
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# copied from https://github.com/NVIDIA/NeMo-Aligner/blob/main/Dockerfile
# with pinned NeMo-Aligner version for reproducibility
# To build NeMo-Aligner from a base PyTorch container:
#
# docker buildx build -t aligner:latest .
#
# To update NeMo-Aligner from a pre-built NeMo-Framework container:
#
# docker buildx build --target=aligner-bump -t aligner:latest .
#
# Number of parallel threads for compute heavy build jobs
# if you get errors building TE or Apex, decrease this to 4
# Global build arguments. Each one must be re-declared with a bare `ARG NAME`
# inside a stage before it can be used there.
ARG MAX_JOBS=8
# Git refs for dependencies
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG PYTRITON_VERSION=0.5.10
# Dockerfile comments are only recognized at the start of a line; a trailing
# "# ..." on an ARG line is parsed as extra arguments, so the branch notes
# live on their own lines.
# NEMO_TAG is on: main
ARG NEMO_TAG=19668e5320a2e2af0199b6d5e0b841993be3a634
# MLM_TAG is on: main
ARG MLM_TAG=25059d3bbf68be0751800f3644731df12a88f3f3
ARG ALIGNER_COMMIT=35fcfd9df754aff56f71cb3ba3382cc02384361a
ARG TRTLLM_VERSION=v0.13.0
ARG PROTOBUF_VERSION=4.24.4
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch:24.07-py3
# Stage "aligner-bump": clones NeMo-Aligner (unless it already exists in the
# base image) and checks out ALIGNER_COMMIT. The final stage copies
# /opt/NeMo-Aligner from here.
FROM ${BASE_IMAGE} AS aligner-bump
# Re-declare the global build arg so it is visible inside this stage.
ARG ALIGNER_COMMIT
WORKDIR /opt
# NeMo Aligner
# The heredoc script runs under `bash -exu`: commands are echoed, and any
# failing command or unset variable aborts the build.
RUN <<"EOF" bash -exu
if [[ ! -d NeMo-Aligner ]]; then
git clone https://github.com/NVIDIA/NeMo-Aligner.git
fi
cd NeMo-Aligner
git fetch origin '+refs/pull/*/merge:refs/remotes/pull/*/merge'
git checkout -f $ALIGNER_COMMIT
# case 1: ALIGNER_COMMIT is a local branch so we have to apply remote changes to it
# case 2: ALIGNER_COMMIT is a commit, so git-pull is expected to fail
git pull --rebase || true
pip install --no-cache-dir --no-deps -e .
EOF
# Stage "final": builds apex, TensorRT-LLM, TransformerEngine, NeMo and
# Megatron-LM from pinned refs, then installs NeMo-Aligner on top.
FROM ${BASE_IMAGE} AS final
LABEL "nemo.library"="nemo-aligner"
WORKDIR /opt
# needed in case git complains that it can't detect a valid email, this email is fake but works
RUN git config --global user.email "worker@nvidia.com"

# install latest apex
# APEX_TAG has no default; when it is empty the default branch of the clone is used.
ARG APEX_TAG
RUN pip uninstall -y apex && \
    git clone https://github.com/NVIDIA/apex && \
    cd apex && \
    if [ -n "$APEX_TAG" ]; then \
        git fetch origin "$APEX_TAG" && \
        git checkout FETCH_HEAD; \
    fi && \
    pip install -v --no-build-isolation --disable-pip-version-check --no-cache-dir --config-settings "--build-option=--cpp_ext --cuda_ext --fast_layer_norm --distributed_adam --deprecated_fused_adam" ./

# Git LFS
# -y keeps apt-get from waiting for confirmation in the non-interactive build.
RUN curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash && \
    apt-get install -y git-lfs && \
    git lfs install && \
    apt-get clean

# TRTLLM
ARG TRTLLM_VERSION
RUN git clone https://github.com/NVIDIA/TensorRT-LLM.git && \
    cd TensorRT-LLM && \
    git checkout ${TRTLLM_VERSION} && \
    . docker/common/install_tensorrt.sh && \
    python3 ./scripts/build_wheel.py --job_count $(nproc) --trt_root /usr/local/tensorrt --python_bindings --benchmarks && \
    pip install -e .
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda-12/compat/lib.real/

# install TransformerEngine
ARG MAX_JOBS
ARG TE_TAG
RUN pip uninstall -y transformer-engine && \
    git clone https://github.com/NVIDIA/TransformerEngine.git && \
    cd TransformerEngine && \
    if [ -n "$TE_TAG" ]; then \
        git fetch origin "$TE_TAG" && \
        git checkout FETCH_HEAD; \
    fi && \
    git submodule init && git submodule update && \
    NVTE_FRAMEWORK=pytorch NVTE_WITH_USERBUFFERS=1 MPI_HOME=/usr/local/mpi pip install .

RUN pip install fire

# place any util pkgs here
ARG PYTRITON_VERSION
RUN pip install --upgrade-strategy only-if-needed nvidia-pytriton==$PYTRITON_VERSION
ARG PROTOBUF_VERSION
RUN pip install -U --no-deps protobuf==$PROTOBUF_VERSION
RUN pip install --upgrade-strategy only-if-needed jsonlines

# NeMo
ARG NEMO_TAG
RUN git clone https://github.com/NVIDIA/NeMo.git && \
    cd NeMo && \
    git pull && \
    if [ -n "$NEMO_TAG" ]; then \
        git fetch origin "$NEMO_TAG" && \
        git checkout FETCH_HEAD; \
    fi && \
    pip uninstall -y nemo_toolkit sacrebleu && \
    pip install -e ".[nlp]" && \
    cd nemo/collections/nlp/data/language_modeling/megatron && make

# MLM
ARG MLM_TAG
RUN pip uninstall -y megatron-core && \
    git clone https://github.com/NVIDIA/Megatron-LM.git && \
    cd Megatron-LM && \
    git pull && \
    if [ -n "$MLM_TAG" ]; then \
        git fetch origin "$MLM_TAG" && \
        git checkout FETCH_HEAD; \
    fi && \
    pip install -e .

# Bring in the NeMo-Aligner checkout prepared by the aligner-bump stage.
COPY --from=aligner-bump /opt/NeMo-Aligner /opt/NeMo-Aligner
RUN cd /opt/NeMo-Aligner && \
    pip install --no-deps -e .

RUN cd TensorRT-LLM && patch -p1 < ../NeMo-Aligner/setup/trtllm.patch

# TODO(terryk): This layer should be deleted ASAP after NeMo is bumped to include all of these PRs
RUN <<"EOF" bash -exu
cd NeMo
# Ensures we don't cherry-pick "future" origin/main commits
git fetch -a
# 0c92fe17df4642ffc33d5d8c0c83fda729e3910c: [fix] Ensures disabling exp_manager with exp_manager=null does not error NeMo#10651
# 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
# 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
# (superseded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
for pr_and_commit in \
  "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
  "10652 60e677423667c029dd05875da72bf0719774f844" \
  "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
; do
  pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
  head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")
  git fetch origin $head_pr_commit:PR-${pr}
  # cherry-picks all commits between main and the top of the PR
  git cherry-pick --allow-empty $(git merge-base origin/main PR-${pr})..PR-${pr}
  # Tag cherry-picks to help
  git tag cherry-pick-PR-${pr}
done
EOF

# patching gpt sft dataset to properly support packing
# TODO: remove when integrated in NeMo
COPY nemo_skills/training/gpt_sft_dataset.py /opt/NeMo/nemo/collections/nlp/data/language_modeling/megatron/gpt_sft_dataset.py
# copied and edited from https://github.com/NVIDIA/NeMo-RL/blob/main/docker/Dockerfile
# Base stage: starts from the CUDA development image and installs common CLI tools
# plus the Nsight Systems CLI. BASE_IMAGE can be overridden with --build-arg.
ARG BASE_IMAGE=nvcr.io/nvidia/cuda-dl-base:25.05-cuda12.9-devel-ubuntu24.04
FROM ${BASE_IMAGE} AS base

# It is more convenient for users to run as root
USER root

RUN <<"EOF" bash -exu -o pipefail
# Keep apt fully non-interactive (no tzdata or debconf prompts).
export DEBIAN_FRONTEND=noninteractive
export TZ=America/Los_Angeles

apt-get update
# The package list ends on `vim` with no trailing backslash, so the command
# does not run on into the following comment line.
apt-get install -y --no-install-recommends \
    jq \
    curl \
    git \
    rsync \
    wget \
    less \
    vim

# Nsight
# apt-get is used throughout: `apt` does not offer a stable interface for scripts.
apt-get install -y --no-install-recommends gnupg
# Register the NVIDIA devtools repository for this Ubuntu release and CPU architecture.
echo "deb http://developer.download.nvidia.com/devtools/repos/ubuntu$(source /etc/lsb-release; echo "$DISTRIB_RELEASE" | tr -d .)/$(dpkg --print-architecture) /" | tee /etc/apt/sources.list.d/nvidia-devtools.list
apt-key adv --fetch-keys http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
apt-get update
apt-get install -y nsight-systems-cli

# Drop apt caches and package lists so they do not end up in the layer.
apt-get clean
rm -rf /var/lib/apt/lists/*
EOF
# Install uv and python
# UV_VERSION selects which uv installer script is downloaded; PYTHON_VERSION is the
# interpreter version installed with `uv python install`. Both can be overridden with --build-arg.
ARG UV_VERSION=0.7.2
ARG PYTHON_VERSION=3.12
# Presumably the directory the uv installer places its binary in, so the `uv` call below resolves — confirm.
ENV PATH="/root/.local/bin:$PATH"
RUN curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh && \
uv python install ${PYTHON_VERSION}
# Disable usage stats by default for users who are sensitive to sharing usage.
# Users are encouraged to enable if they wish.
ENV RAY_USAGE_STATS_ENABLED=0
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs
# Hermetic stage: builds on `base`, checks out NeMo-RL at a pinned commit and syncs its dependencies.
FROM base AS hermetic
# NEMO_RL_COMMIT can be passed with --build-arg; when unset or empty the pinned default below is used.
ARG NEMO_RL_COMMIT
ENV NEMO_RL_COMMIT=${NEMO_RL_COMMIT:-51f8b2672535e30a588f988ce65871442c5109df}
RUN git clone https://github.com/NVIDIA/NeMo-RL.git /opt/NeMo-RL && cd /opt/NeMo-RL && git checkout ${NEMO_RL_COMMIT}
WORKDIR /opt/NeMo-RL
# Variables to control the build of TE. If there are issues with parallelization, consider
# setting these to 1.
ARG MAX_JOBS=4
ARG NVTE_BUILD_THREADS_PER_JOB=1
# Location of the project virtual environment used by uv.
ENV UV_PROJECT_ENVIRONMENT=/opt/nemo_rl_venv
# Default link mode for uv; the sync commands below override it with --link-mode symlink.
ENV UV_LINK_MODE=copy
# Create the virtual environment and sync dependencies into it
# (its bin directory is added to PATH by a later ENV instruction, not in this layer).
RUN <<"EOF" bash -exu
uv venv ${UV_PROJECT_ENVIRONMENT}
# uv sync has a more reliable resolver than simple uv pip install which can fail
# Sync each training + inference backend one at a time (since they may conflict)
# to warm the uv cache, then at the end just sync the default dependencies.
# Do everything in one layer to prevent large layers.
# The venv is symlinked to avoid bloating the layer size
uv sync --link-mode symlink --locked --no-install-project
uv sync --link-mode symlink --locked --extra vllm --no-install-project
uv sync --link-mode symlink --locked --extra mcore --no-install-project
uv sync --link-mode symlink --locked --all-groups --no-install-project
EOF
# Put the project virtual environment's executables first on PATH.
ENV PATH="/opt/nemo_rl_venv/bin:$PATH"
# NOTE(review): NEMO_RL_VENV_DIR is already set to this same value in the `base` stage,
# which this stage inherits from, so this repeat looks redundant — confirm before removing.
ENV NEMO_RL_VENV_DIR=/opt/ray_venvs
# Prefetch all virtual environments
# Copy entire source to temp location, run prefetch, then clean up
RUN cp -r /opt/NeMo-RL /tmp/nemo-rl-prefetch && cd /tmp/nemo-rl-prefetch && \
UV_PROJECT_ENVIRONMENT="/tmp/nemo-rl-prefetch/.venv" uv run nemo_rl/utils/prefetch_venvs.py && \
cd / && \
rm -rf /tmp/nemo-rl-prefetch
# Clone NeMo-Skills (no commit or tag is pinned here) and install it with uv.
RUN git clone https://github.com/NVIDIA/NeMo-Skills.git /opt/NeMo-Skills && cd /opt/NeMo-Skills && uv pip install .
# Image with NeMo-Skills installed in editable mode together with the benchmark
# repositories and tools that are cloned or installed below.
FROM python:3.10

RUN apt-get update && apt-get -y install curl git git-lfs

# for ifeval benchmark
# TODO: can we get just a single dir?
RUN mkdir /opt/benchmarks
RUN git clone https://github.com/google-research/google-research.git /opt/benchmarks/google-research --depth=1

# ifbench
RUN git clone https://github.com/allenai/IFBench.git /opt/benchmarks/IFBench --depth=1
# The `unicodedata` requirement line is dropped from requirements.txt before installing.
RUN cd /opt/benchmarks/IFBench && sed -i '/^unicodedata[=<>]*.*$/d' requirements.txt && pip install -r requirements.txt

# Every RUN starts a fresh shell, so a standalone `cd` has no effect on later
# instructions; commands below use absolute paths or their own `cd`.
RUN pip install langdetect absl-py immutabledict nltk ipython && \
    python -c "import nltk; nltk.download('punkt'); nltk.download('punkt_tab')"

# gorilla is pinned to a fixed commit; its berkeley-function-call-leaderboard package is installed in editable mode.
RUN git clone https://github.com/ShishirPatil/gorilla.git /opt/gorilla
RUN cd /opt/gorilla && git checkout d2177992bbba9aa228b53c0645bf8f5613a5a7c6
RUN cd /opt/gorilla/berkeley-function-call-leaderboard && pip install -e .

# installing apptainer
RUN apt install -y wget && \
    cd /tmp && \
    wget https://github.com/apptainer/apptainer/releases/download/v1.4.1/apptainer_1.4.1_amd64.deb && \
    apt install -y ./apptainer_1.4.1_amd64.deb

# Copy the package sources and requirement files from the build context, then install NeMo-Skills.
RUN mkdir -p /opt/NeMo-Skills/requirements
COPY pyproject.toml README.md /opt/NeMo-Skills/
COPY nemo_skills /opt/NeMo-Skills/nemo_skills/
COPY requirements /opt/NeMo-Skills/requirements/
# The extras spec is quoted so the shell cannot treat `[all]` as a glob pattern.
RUN cd /opt/NeMo-Skills && pip install -e ".[all]"
\ No newline at end of file
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Use the base image with Python 3.10 and Flask
FROM tiangolo/uwsgi-nginx-flask:python3.10
# Install dependencies required for Lean 4 and other tools
# The same layer also downloads PyPy 3.10 (v7.3.17), unpacks it under /opt,
# symlinks its interpreter to /usr/local/bin/pypy3 and deletes the downloaded archive.
RUN apt-get update && \
apt-get install -y curl git bzip2 && \
curl -L https://downloads.python.org/pypy/pypy3.10-v7.3.17-linux64.tar.bz2 -o /tmp/pypy.tar.bz2 && \
tar -xjf /tmp/pypy.tar.bz2 -C /opt/ && \
ln -s /opt/pypy3.10-v7.3.17-linux64/bin/pypy3 /usr/local/bin/pypy3 && \
rm /tmp/pypy.tar.bz2
# Install elan via its init script (run with -y), then install the Lean v4.12.0 toolchain,
# make it the default and update elan itself.
RUN curl https://raw.githubusercontent.com/leanprover/elan/master/elan-init.sh -sSf | sh -s -- -y && \
/root/.elan/bin/elan toolchain install leanprover/lean4:v4.12.0 && \
/root/.elan/bin/elan default leanprover/lean4:v4.12.0 && \
/root/.elan/bin/elan self update
# Set environment variables to include Lean and elan/lake in the PATH
ENV PATH="/root/.elan/bin:$PATH"
# Create Lean project directory and initialize a new Lean project with Mathlib4
# The toolchain file is pinned to v4.12.0 and a Mathlib requirement at tag v4.12.0 is appended to lakefile.lean.
RUN mkdir -p /lean4 && cd /lean4 && \
/root/.elan/bin/lake new my_project && \
cd my_project && \
echo 'leanprover/lean4:v4.12.0' > lean-toolchain && \
echo 'require mathlib from git "https://github.com/leanprover-community/mathlib4" @ "v4.12.0"' >> lakefile.lean
# Download and cache Mathlib4 to avoid recompiling, then build the project
RUN cd /lean4/my_project && \
/root/.elan/bin/lake exe cache get && \
/root/.elan/bin/lake build
# Set environment variables to include Lean project path
ENV LEAN_PATH="/lean4/my_project"
ENV PATH="/lean4/my_project:$PATH"
# Set up application code and install Python dependencies
# Requirements are copied and installed before the server file is copied.
COPY requirements/code_execution.txt /app/requirements.txt
RUN pip install --no-cache-dir -r /app/requirements.txt
# The sandbox server module is installed as /app/main.py.
COPY nemo_skills/code_execution/local_sandbox/local_sandbox_server.py /app/main.py
# For scicode eval
# Downloads the test data from Google Drive with gdown and stores it at /data/test_data.h5.
RUN mkdir /data && pip install gdown && \
python -c "import gdown; url = f'https://drive.google.com/uc?id=17G_k65N_6yFFZ2O-jQH00Lh6iaw3z-AW'; gdown.download(url, '/data/test_data.h5', quiet=False)"
# Set the working directory to /app
WORKDIR /app
# Set Flask app environment variables and ports
# UWSGI_CHEAPER and UWSGI_PROCESSES have no defaults: they are taken from --build-arg
# and end up empty in the image when not provided.
ARG UWSGI_CHEAPER
ENV UWSGI_CHEAPER=$UWSGI_CHEAPER
ARG UWSGI_PROCESSES
ENV UWSGI_PROCESSES=$UWSGI_PROCESSES
# NOTE(review): presumably read by the base image to choose the listening port — confirm against the base image docs.
ENV LISTEN_PORT=6000
# SGLang image (v0.4.10.post2, cu126 tag) with a local patch applied to the bundled sglang checkout.
FROM lmsysorg/sglang:v0.4.10.post2-cu126
# patching for sharding states support for DeepSeek-R1
# The patch file is copied from the build context and applied with `git apply` inside /sgl-workspace/sglang.
COPY dockerfiles/sglang.patch /sgl-workspace/sglang.patch
RUN cd /sgl-workspace/sglang && git apply /sgl-workspace/sglang.patch
\ No newline at end of file
# verl image: starts from the prebuilt verl base image and installs verl at a pinned
# commit in editable mode, followed by a few extra Python packages.
FROM whatcanyousee/verl:ngc-cu124-vllm0.8.5-sglang0.4.6-mcore0.12.0-te2.3

# Set working directory
WORKDIR /opt

# Install verl
ENV VERL_COMMIT=2ed63bbf39c22724e4940d97e4b09e4f3e5f6d68
RUN git clone https://github.com/volcengine/verl.git && \
    cd verl && \
    git checkout ${VERL_COMMIT} && \
    pip3 install -e .

RUN pip install fire
RUN pip3 install -U pynvml

WORKDIR /workspace

# Fix CV2
RUN pip install opencv-fixer==0.2.5 && \
    python -c "from opencv_fixer import AutoFix; AutoFix()"

# Run additional dependencies
# Requirements with extras are quoted so the shell cannot expand the square
# brackets as glob patterns against files in the working directory.
RUN pip install "math-verify[antlr4_9_3]" "ray[default]" pylatexenc wandb

CMD ["/usr/bin/bash"]
\ No newline at end of file
# vLLM OpenAI-server image (v0.10.0) with an additional editable source checkout of the same version.
FROM vllm/vllm-openai:v0.10.0
# adding editable vllm installation to allow overriding python code with a custom mount
# The checkout lives in /opt/vllm at tag v0.10.0 and is installed with VLLM_USE_PRECOMPILED=1.
RUN cd /opt && git clone https://github.com/vllm-project/vllm.git && cd vllm && git checkout v0.10.0 && VLLM_USE_PRECOMPILED=1 pip install -e .
# How to build all necessary dockerfiles
Some dockerfiles are directly included in this folder and for some others the instructions to build them are below.
To build one of the existing dockerfiles use a command like this
```
docker build -t igitman/nemo-skills-nemo:0.6.1 -f dockerfiles/Dockerfile.nemo .
```
It might take a long time for some of the images.
## Building trtllm image
We directly use official TensorRT-LLM ngc containers. Current version is `nvcr.io/nvidia/tensorrt-llm/release:0.21.0`.
diff --git a/python/sglang/srt/model_loader/loader.py b/python/sglang/srt/model_loader/loader.py
index e2c6a37..4ee6347 100644
--- a/python/sglang/srt/model_loader/loader.py
+++ b/python/sglang/srt/model_loader/loader.py
@@ -653,6 +653,11 @@ class ShardedStateLoader(BaseModelLoader):
state_dict.pop(key)
if state_dict:
raise ValueError(f"Missing keys {tuple(state_dict)} in loaded state!")
+
+ if hasattr(model, "post_load_weights"):
+ print("Post loading weights")
+ model.post_load_weights()
+
return model.eval()
@staticmethod
# Chat Interface
The chat interface provides a web UI where you can interactively chat with a deployed model. It supports features like multi-turn conversations and, for certain models like [OpenMath-Nemotron](https://huggingface.co/collections/nvidia/openmathreasoning-68072c0154a5099573d2e730), code execution capabilities.
![Chat Interface Demo](../assets/chat_interface_demo.gif)
## Launching
There are two main ways to launch the chat interface:
### 1. Via `ns start_server`
You can launch the chat interface alongside the model server directly on a cluster or remote machine using the `ns start_server` command:
```bash
ns start_server \
--model Qwen/Qwen3-8B \
--server_type vllm \
--server_gpus 1 \
--config local \
--launch_chat_interface \
[--extra_chat_args "<hydra_options_for_chat_ui>"]
```
### 2. Manual Launch
Alternatively, you can launch the chat interface manually if you have the `nemo_skills` environment installed locally. This method is suitable when you want to connect to an already running model server.
```bash
python -m nemo_skills.inference.chat_interface.launch server_type=vllm [other_hydra_options]
```
Set `server_type` to the type of server you are connecting to (e.g., `server_type=vllm`) and pass any other settings, such as the path to your model's configuration (e.g., `model_config_path=/path/to/model/config.json`), in place of `[other_hydra_options]`.
All relevant parameters for the chat interface, such as the model details, server endpoint, and UI elements, can be configured via Hydra command-line arguments. For a comprehensive list of configurable parameters, please refer to the configuration schema in `nemo_skills/inference/chat_interface/core.py`.
When launched this way, the chat interface will run on the same node as the model server.
#### Accessing the Interface (Cluster/Remote Launch)
To access the chat interface when it's launched via `ns start_server` on a remote machine or cluster, you'll need to set up an SSH tunnel to forward the port (default is `7860`) from the remote machine to your local machine.
* **For Slurm clusters:**
Use the following command, replacing `cluster` with the slurm cluster hostname or IP address, `username` with your username, and `node-name` with the name of the node where the server is running:
```bash
ssh -J cluster -N -f -L localhost:7860:localhost:7860 username@node-name
```
* **For remote workstations/servers:**
Use the following command, replacing `username` with your username and `server` with the hostname or IP address of the remote machine:
```bash
ssh -N -f -L localhost:7860:localhost:7860 username@server
```
Once the tunnel is established, you can access the interface by navigating to `http://localhost:7860` in your web browser.
# Cluster configs
All of the [pipeline scripts](../pipelines/index.md) accept `--cluster` argument which you can use
to control where the job gets executed (you need a "local" cluster config to run jobs locally as well).
That argument picks up one of the configs inside your local
[cluster_configs](https://github.com/NVIDIA/NeMo-Skills/tree/main/cluster_configs)
folder by default, but you can specify another location with `--config_dir` or set it in `NEMO_SKILLS_CONFIG_DIR` env variable.
You can also use `NEMO_SKILLS_CONFIG` env variable instead of the `--cluster` parameter.
The cluster config defines an executor (local or slurm), mounts for data/model access and (slurm-only) various parameters
such as account, partition, ssh-tunnel arguments and so on.
The recommended way to launch jobs on slurm is by running all commands locally and specifying `ssh_tunnel` portion in cluster config
to let [NeMo-Run](https://github.com/NVIDIA/NeMo-Run) know how to connect there.
But if you prefer to run from the cluster directly, you can install NeMo-Skills there
and then only specify `job_dir` parameter without using `ssh_tunnel` section in the config.
You can see example configs in [cluster_configs](https://github.com/NVIDIA/NeMo-Skills/tree/main/cluster_configs) folder.
To create a new config you can either rename and modify one of the examples or run
```bash
ns setup
```
that will help to create all necessary configs step-by-step.
## Environment variables
You can define environment variables in the cluster config file, which will be set inside the container.
```yaml
env_vars:
- MYENVVAR # will pick the value from env
- MYENVVAR2=my_value # will use my_value
```
If an environment variable is required, and you want us to fail if it's not provided,
you can use `required_env_vars` instead. One thing to note is that `required_env_vars` does not support
passing values directly, so you must provide them via environment variable only.
Depending on which pipelines you run, you might need to define the following environment variables
``` bash
# only needed for training (can opt-out with --disable_wandb)
export WANDB_API_KEY=...
# only needed if using gated models, like llama3.1
export HF_TOKEN=...
# only needed if running inference with OpenAI models
export OPENAI_API_KEY=...
# only needed if running inference with Azure OpenAI models
export AZURE_OPENAI_API_KEY=...
# only needed if running inference with Nvidia NIM models
export NVIDIA_API_KEY=...
```
## Useful tips
Here are some suggestions on what can be defined in cluster configs for different use-cases
1. Set `HUGGINGFACE_HUB_CACHE` environment variable to ensure all HuggingFace downloads are cached
2. If you want to have a custom version of one of the underlying libraries that we use
(e.g. [NeMo](https://github.com/NVIDIA/NeMo) or [verl](https://github.com/volcengine/verl)),
you can clone it locally (or on cluster if using slurm), make your changes and then override in the container with
```yaml
mounts:
- <your path>/NeMo:/opt/NeMo
- <your path>/verl:/opt/verl
```
3. You can specify custom containers - our code should work out-of-the-box or with very little changes with different
versions of inference libraries (e.g. [vLLM](https://github.com/vllm-project/vllm)) or training libraries
(e.g. [NeMo](https://github.com/NVIDIA/NeMo)). If you get some errors, you might also need to modify the entry-point
scripts we use, e.g.
[nemo_skills/inference/server/serve_vllm.py](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/inference/server/serve_vllm.py)
or [nemo_skills/training/start_sft.py](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/training/start_sft.py)
4. For slurm clusters it's recommended to [build .sqsh files](https://github.com/NVIDIA/enroot/blob/master/doc/cmd/import.md#example)
for all containers and reference the cluster path
# Code packaging
We use [NeMo-Run](https://github.com/NVIDIA/NeMo-Run) for managing our experiments with local and slurm-based
execution supported (please open an issue if you need to run our code on other kinds of clusters).
This means that even if you need to submit jobs on slurm, you can do it from your local machine by defining an
appropriate cluster config and nemo-run will package and upload your code, data and manage
all complexities of slurm scheduling. Check their documentation to learn how to fetch logs, check status,
cancel jobs, etc.
To decide which code to package we use the following logic:
1. If you run commands from inside a cloned NeMo-Skills repository, we will package that repository.
2. If you run commands from inside a git repository which is not NeMo-Skills (doesn't have `nemo_skills` top-level folder),
we will package your current repository and also include `nemo_skills` subfolder from its installed location.
3. If you run commands from outside of any git repository, we will only package `nemo_skills` subfolder from its installed
location.
Put simply, we will always include `nemo_skills` and will additionally include your personal git repository if you're
running commands from it.
!!! note
When packaging a git repository, NeMo-Run will only package the code tracked by git
(as well as all jsonl files from `nemo_skills/dataset`).
Any non-tracked files will not be automatically available inside the container or uploaded to slurm.
When packaging `nemo_skills` from its installed location (which might not be a git repository), we will
upload **all** the files inside `nemo_skills` subfolder. Make sure you do not store any large files there
to avoid uploading them on the cluster with each experiment!
!!! note
When you run commands from a git repo with uncommitted changes, NeMo-Run throws the following error
```
RuntimeError: Your repo has uncommitted changes. Please commit your changes or set check_uncommitted_changes to False to proceed with packaging.
```
This error can be avoided by either taking care of the uncommitted changes (via commit/revert), or setting the environment variable:
```bash
export NEMO_SKILLS_DISABLE_UNCOMMITTED_CHANGES_CHECK=1
```
In all cases, uncommitted code will not be used.
Finally, it's important to keep in mind that whenever you submit a new experiment, NeMo-Run will create a copy of your
code package both locally (inside `~/.nemo_run`) and on cluster (inside `ssh_tunnel/job_dir` path in your cluster config).
If you submit multiple experiments from the same Python script, they will all share code, so only one copy will be
created per run of that script. Even so, at some point, the code copies will accumulate and you will run out of
space both locally and on cluster. There is currently no automatic cleaning, so you have to monitor for that and
periodically remove local and cluster nemo-run folders to free up space. There is no side effect of doing that (they will
be automatically recreated) as long as you don't have any running jobs when you remove the folders.
If you want to have more fine-grained control over code reuse, you can directly specify `--reuse_code_exp` argument when submitting jobs.
While our job submission is somewhat complicated and goes through NeMo-Run, at the end, we simply execute a particular sbatch file
that is uploaded to the cluster. It is helpful sometimes to see what's in it and modify directly. You can find sbatch file(s)
for each job inside `ssh_tunnel.job_dir` cluster folder that is defined in your cluster config.
# Inference
Here are the instructions on how to run inference with our repo.
## Download/convert the model
Get the model you want to use. You can use any model that's supported by vLLM, sglang, TensorRT-LLM or Megatron.
You can also use [Nvidia NIM API](https://www.nvidia.com/en-us/ai/) for models that are hosted there.
## Start the server
Start the server hosting your model. Here is an example (make sure the `/hf_models` mount is defined in your cluster config). Skip this step if you want to use cloud models through an API.
```bash
ns start_server \
--cluster local \
--model /hf_models/Meta-Llama-3.1-8B-Instruct \
--server_type vllm \
--server_gpus 1 \
--server_nodes 1
```
If the model needs to execute code, add `--with_sandbox`.
You could also launch an interactive web chat application by adding `--launch_chat_interface`, for more details see the [Chat Interface documentation](chat_interface.md).
## Send inference requests
Click on :material-plus-circle: symbols in the snippet below to learn more details.
=== "Self-hosted models"
```python
from nemo_skills.inference.model import get_model
from nemo_skills.prompt.utils import get_prompt
llm = get_model(server_type="vllm") # localhost by default
prompt = get_prompt('generic/default', 'llama3-instruct') # (1)!
prompt = prompt.fill({'question': "What's 2 + 2?"})
print(prompt) # (2)!
output = llm.generate_sync(prompt=prompt)
print(output["generation"]) # (3)!
```
1. Here we use [generic/default](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/generic/default.yaml) config
and [llama3-instruct](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/template/llama3-instruct.yaml) template.
See [nemo_skills/prompt](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt) for more config/template options
or [create your own prompts](prompt-format.md)
2. This should print
```python-console
>>> print(prompt)
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
<|eot_id|><|start_header_id|>user<|end_header_id|>
What's 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
If you don't want to use our prompt class, just create this string yourself
3. This should print
```python-console
>>> print(output["generation"])
2 + 2 = 4.
```
=== "API models"
```python
from nemo_skills.inference.model import get_model
from nemo_skills.prompt.utils import get_prompt
llm = get_model( # (1)!
server_type="openai", # NIM models are using OpenAI API
base_url="https://integrate.api.nvidia.com/v1",
model="meta/llama-3.1-8b-instruct",
)
prompt = get_prompt('generic/default') # (2)!
prompt = prompt.fill({'question': "What's 2 + 2?"})
print(prompt) # (3)!
output = llm.generate_sync(prompt=prompt)
print(output["generation"]) # (4)!
```
1. Don't forget to define `NVIDIA_API_KEY`.
To use OpenAI models, use `OPENAI_API_KEY` and set `base_url=https://api.openai.com/v1`.
2. Here we use [generic/default](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/generic/default.yaml) config.
Note that with API models we can't add special tokens, so prompt template is not specified.
See [nemo_skills/prompt](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt) for more config/template options
or [create your own prompts](prompt-format.md)
3. This should print
```python-console
>>> print(prompt)
[{'role': 'user', 'content': "What's 2 + 2?"}]
```
If you don't want to use our prompt class, just create this list yourself
4. This should print
```python-console
>>> print(output["generation"])
2 + 2 = 4.
```
=== "With code execution"
``` python
from nemo_skills.code_execution.sandbox import get_sandbox
from nemo_skills.inference.model import get_code_execution_model
from nemo_skills.prompt.utils import get_prompt
sandbox = get_sandbox() # localhost by default
llm = get_code_execution_model(server_type="vllm", sandbox=sandbox)
prompt = get_prompt('generic/default', 'llama3-instruct', code_tags='llama3') # (1)!
prompt.config.system = ( # (2)!
"Environment: ipython\n\n"
"Use Python to solve this math problem."
)
prompt = prompt.fill({'question': "What's 2 + 2?"})
print(prompt) # (3)!
output = llm.generate_sync(prompt=prompt, **prompt.get_code_execution_args()) # (4)!
print(output["generation"]) # (5)!
```
1. Here we use [generic/default](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/generic/default.yaml) config
and [llama3-instruct](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/template/llama3-instruct.yaml) template.
Note how we are updating system message on the next line (you can also include it in the config directly).
See [nemo_skills/prompt](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt) for more config/template options
or [create your own prompts](prompt-format.md)
2. 8B model doesn't always follow these instructions, so using 70B or 405B for code execution is recommended.
3. This should print
```python-console
>>> print(prompt)
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
Environment: ipython
Use Python to solve this math problem.<|eot_id|><|start_header_id|>user<|end_header_id|>
What's 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
If you don't want to use our prompt class, just create this string yourself
4. `prompt.get_code_execution_args()` simply returns a dictionary with start/stop tokens,
so that we know when to stop LLM generation and how to format the output.
If you don't want to use our prompt class, just define those parameters directly.
5. This should print
```python-console
>>> print(output["generation"])
<|python_tag|>print(2 + 2)<|eom_id|><|start_header_id|>ipython<|end_header_id|>
completed
[stdout]
4
[/stdout]<|eot_id|><|start_header_id|>assistant<|end_header_id|>
The answer is 4.
```
The "4" in the stdout is coming directly from Python interpreter running in the sandbox.
Note that for self-hosted models we are explicitly adding all the special tokens before sending prompt to an LLM.
This is necessary to retain flexibility. E.g. this way we can use base model format with
instruct models that we found to work better with few-shot examples.
You can learn more about how our prompt formatting works in the [prompt format docs](../basics/prompt-format.md).
!!! note
You can also use slurm config when launching a server. If you do that, add `host=<slurm node hostname>`
to the `get_model/sandbox` calls and define `NEMO_SKILLS_SSH_KEY_PATH` and `NEMO_SKILLS_SSH_SERVER` env vars
to set the connection through ssh.
\ No newline at end of file
# Prompt utilities
Our prompts are configured via three yaml files:
1. **Prompt template** - defines model-specific chat format and special tokens
2. **Prompt config** - contains the actual prompt text with placeholders
3. **Code tags** - specifies code formatting tokens, required for code execution
## Prompt template
The template file defines model-specific special tokens, e.g. bos, turn tokens,
user/assistant/system message, etc. All of the
templates that we support by default are available in
[nemo_skills/prompt/template](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/template)
folder. Here is an example template for
[llama3-instruct](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/template/llama3-instruct.yaml) models:
```yaml
# Prompt specification for the original Llama3-instruct model
# these tokens are always used to construct a prompt like this
#
# single-turn:
# <text_begin><system_begin>{system}<system_end><user_begin>{user}<user_end><assistant_begin>{generation}
# multi-turn:
# <text_begin><system_begin>{system}<system_end><user_begin>{user1}<user_end><assistant_begin>{assistant1}<assistant_end>...
# <user_begin>{userN}<user_end><assistant_begin>{generation}
text_begin: "<|begin_of_text|>"
system_begin: "<|start_header_id|>system<|end_header_id|>\n\n"
system_end: "<|eot_id|>"
user_begin: "<|start_header_id|>user<|end_header_id|>\n\n"
user_end: "<|eot_id|>"
assistant_begin: "<|start_header_id|>assistant<|end_header_id|>\n\n"
assistant_end: "<|eot_id|>"
stop_phrases: ["<|eot_id|>"]
```
You can specify a particular template with `++prompt_template=...`. If you don't add a .yaml extension (e.g.
`++prompt_template=llama3-instruct`), we assume you want to use one of the existing templates and will search
in the included folder. If you provide a full path, we will take the file you specify instead.
!!! note
If you're using OpenAI server type (models are hosted elsewhere), you cannot provide the template
as we cannot add any special tokens and have to send the user/assistant messages following the OpenAI API.
For self-hosted models with TensorRT-LLM, the template is required, but for other servers it's optional.
## Prompt config
The prompt config contains user and system messages with placeholders for keys from a data file.
The configs are model independent (any model can be used with any config).
All of the configs that we support by default are available in
[nemo_skills/prompt/config](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config)
folder. Here is an example prompt for
[math evaluations](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/generic/math.yaml):
```yaml
# default prompt for all math benchmarks (e.g. gsm8k, math)
few_shot_examples:
prefix: "Here are some examples of problems and solutions you can refer to.\n\n"
template: "Problem:\n{problem}\n\nSolution:\n{solution}\n\n\n\n\n\n"
suffix: "Here is the problem you need to solve:\n"
# this is built as <prefix>{template.format(example1)}{template.format(example2)}...{template.format(exampleN)}<suffix>
# and available as {examples} key in the final prompt
# if examples_type is not specified, then {examples} will be empty
# by default there are no examples, but can be changed from code/cmd
system: ""
user: |-
Solve the following math problem. Make sure to put the answer (and only answer) inside \boxed{{}}.
{examples}{problem}
```
Note that we use `{problem}`, `{solution}` and `{examples}` format strings here. The `{examples}` is a special
key that will be used to include few shot examples you specify above (it's empty unless you add `++examples_type` or
specify it in the config like e.g. in
[llama3-gsm8k prompt](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/generic/gsm8k.yaml)).
All other keys will need to be specified when you call `prompt.fill`
(more on that in the [prompt-api section](#prompt-api)) so that we can replace placeholders with actual input.
The input for few shot examples always comes from one of the available example types in
[here](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/few_shot_examples/__init__.py). E.g. in the
[llama3-instruct/gsm8k](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/config/llama3-instruct/gsm8k.yaml)
prompt the `gsm8k_standard_few_shot` examples from
[here](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/few_shot_examples/gsm8k.py) are used.
## Code tags
Code tags define the special tokens that models use to mark executable code blocks and their output. Code tags are required when using code execution.
All code tags that we support by default are available in
[nemo_skills/prompt/code_tags](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/code_tags).
Here is an example code tags file for the [llama3](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/prompt/code_tags/llama3.yaml) family:
```yaml
# Code tags for llama3 family models
# used to execute code within these tags
code_begin: "<|python_tag|>"
code_end: "<|eom_id|>"
# used to extract the code output
code_output_begin: "<|start_header_id|>ipython<|end_header_id|>"
code_output_end: "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
# how to post-process the captured output (choices: llama, qwen)
code_output_format: "llama"
```
## Prompt API
If you're running one of the pipeline scripts, you can control the prompt by using:
```bash
++prompt_template=...
++prompt_config=...
++code_tags=...
++examples_type=...
```
If you're implementing a new script, you can use the following code to create a prompt and then use it:
```python
from nemo_skills.prompt.utils import get_prompt
# The third parameter is optional and only needed for code execution
prompt = get_prompt('generic/math', 'llama3-instruct', code_tags='llama3')
print(prompt.fill({'problem': "What's 2 + 2?"}))
```
which outputs
```python-console
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
<|eot_id|><|start_header_id|>user<|end_header_id|>
Solve the following math problem. Make sure to put the answer (and only answer) inside \boxed{}.
What's 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
Or if you want to skip the template and use OpenAI API
```python
from nemo_skills.prompt.utils import get_prompt
prompt = get_prompt('generic/math')
print(prompt.fill({'problem': "What's 2 + 2?"}))
```
which outputs
```python-console
[
{
'role': 'user',
'content': "Solve the following math problem. Make sure to put the answer (and only answer) inside \\boxed{}.\n\nWhat's 2 + 2?"
}
]
```
You can also have a look at the [tests](https://github.com/NVIDIA/NeMo-Skills/tree/main/tests/test_prompts.py) to see more examples of using our prompt API.
## Multi-turn prompts
If your data is naturally multi-turn (e.g. user-assistant conversations), you can use a special parameter `multi_turn_key` to format
the whole conversation together. It can be of any length, as long as each entry except the last has a special `assistant` key. The prompt config
will be applied to each list entry separately. Here is an example
```python
from nemo_skills.prompt.utils import get_prompt
prompt = get_prompt('generic/default')
data = {'turns': [{'question': "What's 2 + 2?", 'assistant': "easy, that's 5!"}, {'question': 'Can you double check?'}]}
print(prompt.fill(data, multi_turn_key='turns'))
```
which outputs
```python-console
[
{
'role': 'user',
'content': "What's 2 + 2?"
},
{
'role': 'assistant',
'content': "easy, that's 5!"
},
{
'role': 'user',
'content': 'Can you double check?'
}
]
```
or if using template
```python
from nemo_skills.prompt.utils import get_prompt
prompt = get_prompt('generic/default', 'llama3-instruct')
data = {'turns': [{'question': "What's 2 + 2?", 'assistant': "easy, that's 5!"}, {'question': 'Can you double check?'}]}
print(prompt.fill(data, multi_turn_key='turns'))
```
which outputs
```python-console
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
<|eot_id|><|start_header_id|>user<|end_header_id|>
What's 2 + 2?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
easy, that's 5!<|eot_id|><|start_header_id|>user<|end_header_id|>
Can you double check?<|eot_id|><|start_header_id|>assistant<|end_header_id|>
```
# Sandbox for code execution
Our pipeline relies on a Python interpreter to execute code generated by LLMs. This creates a security risk,
since we are executing arbitrary code that we do not have full control over. To partially address this,
we provide a basic sandbox that we use to execute code and validate the correctness of LLM-generated answers.
## Local sandbox
The default sandbox option used in our pipeline is a local docker container.
Check out [nemo_skills/code_execution/local_sandbox](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/code_execution/local_sandbox)
for implementation details.
Please note that our provided sandbox is not fully secure and you are strongly encouraged to
set up a properly configured virtual machine such that generated code executes in an unprivileged environment
with no external network access unless necessary.
## Piston sandbox
A better alternative is to host a [Piston server](https://github.com/engineer-man/piston)
in a properly configured VM. If you're using a Piston server (you need to host it yourself),
add the following parameters to the relevant scripts
```bash
++sandbox_type=piston
++sandbox.host=<where your server is hosted, e.g. https://emkc.org/api/v2/piston>
```
## Other sandboxes
Our sandbox API makes no assumptions on where or how the code is executed, so it's very easy
to extend it. E.g. you can use AWS Lambda functions or other similar offerings.
Please open an issue if you'd like us to add support for another sandbox in the future.
\ No newline at end of file
/* Target only inline code */
p code, li code, td code {
word-break: keep-all;
white-space: nowrap;
}
/* Preserve formatting for multi-line code blocks */
pre code {
word-break: normal;
white-space: pre;
}
:root {
--md-tooltip-width: 600px;
}
\ No newline at end of file
---
hide:
- navigation
- toc
---
[NeMo-Skills](https://github.com/NVIDIA/NeMo-Skills) is a collection of pipelines to improve "skills" of large language models (LLMs). We support everything needed for LLM development, from synthetic data generation, to model training, to evaluation on a wide range of benchmarks. Start developing on a local workstation and move to a large-scale Slurm cluster with just a one-line change.
Here are some of the features we support:
- [Flexible LLM inference](basics/inference.md):
- Seamlessly switch between API providers, local server and large-scale Slurm jobs for LLM inference.
- Host models (on 1 or many nodes) with [TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [vLLM](https://github.com/vllm-project/vllm), [sglang](https://github.com/sgl-project/sglang) or [Megatron](https://github.com/NVIDIA/Megatron-LM).
- Scale SDG jobs from 1 GPU on a local machine all the way to tens of thousands of GPUs on a Slurm cluster.
- [Model evaluation](pipelines/evaluation.md):
- Evaluate your models on many popular benchmarks.
- Math problem solving: hmmt_feb25, brumo25, aime24, aime25, omni-math (and many more)
- Formal proofs in Lean: minif2f, proofnet
- Coding skills: scicode, livecodebench, human-eval, mbpp
- Chat/instruction following: ifbench, ifeval, arena-hard
- General knowledge: mmlu, mmlu-pro, gpqa
- Long context: ruler
- Easily parallelize each evaluation across many Slurm jobs, self-host LLM judges, bring your own prompts or change benchmark configuration in any other way.
- [Model training](pipelines/training.md): Train models using [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/), [NeMo-RL](https://github.com/NVIDIA/NeMo-RL/) or [verl](https://github.com/volcengine/verl).
To get started, follow these [steps](basics/index.md), browse available [pipelines](./pipelines/index.md) or run `ns --help` to see all available
commands and their options.
You can find more examples of how to use NeMo-Skills in the [tutorials](./tutorials/index.md) page.
We've built and released many popular models and datasets using NeMo-Skills. See all of them in the [Papers & Releases](./releases/index.md) documentation.
\ No newline at end of file
# Checkpoint conversion
!!! info
This pipeline starting script is [nemo_skills/pipeline/convert.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/convert.py)
All extra parameters are passed to one of the following scripts
* For conversion to NeMo:
- If `--model_type=llama`: [nemo_skills/conversion/hf_to_nemo_llama.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/conversion/hf_to_nemo_llama.py)
- If `--model_type=qwen`: [nemo_skills/conversion/hf_to_nemo_qwen.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/conversion/hf_to_nemo_qwen.py)
* For conversion to HuggingFace:
- If `--model_type=llama`: [nemo_skills/conversion/nemo_to_hf_llama.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/conversion/nemo_to_hf_llama.py)
- If `--model_type=qwen`: [nemo_skills/conversion/nemo_to_hf_qwen.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/conversion/nemo_to_hf_qwen.py)
You only need to convert models if you want to use NeMo-Aligner for training. All other server and training backends
should work with HuggingFace format.
To convert the checkpoint from one format to another use a command like this
```bash
ns convert \
--cluster=slurm \
--input_model=/hf_models/Meta-Llama-3.1-70B-Instruct \
--output_model=/trt_models/llama3.1-70b-instruct \
--convert_from=hf \
--convert_to=nemo \
--model_type=llama \
--num_gpus=8 \
--hf_model_name=meta-llama/Meta-Llama-3.1-70B-Instruct
```
You can provide any extra arguments that will be passed directly to the underlying conversion scripts.
# LLM-based data decontamination
!!! info
This pipeline starting script is [nemo_skills/pipeline/generate.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/generate.py)
All extra parameters are passed to [nemo_skills/inference/check_contamination.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/inference/check_contamination.py)
We implemented an LLM-based data decontamination pipeline following
[lmsys methodology](https://lmsys.org/blog/2023-11-14-llm-decontaminator/).
There are two main ways to use this pipeline: to check an existing dataset
for contamination and to decontaminate the training dataset by removing all
contaminated questions.
## To check for contamination
Let's say you want to check for contamination of [MATH](https://github.com/hendrycks/math)
training set with MATH, AMC-23 and AIME-24 test sets. First, get the data
```bash
ns prepare_data math amc23 aime24
```
Then we need to retrieve top-k similar questions from the training set. Assuming
you have `/workspace` defined in your [cluster config](../basics/cluster-configs.md)
you can do it in the following way
```python
from nemo_skills.pipeline.cli import wrap_arguments, run_cmd, generate
test_sets = ['math', 'amc23', 'aime24']
compare_to = ",".join(f"/nemo_run/code/nemo_skills/dataset/{test_set}/test.jsonl" for test_set in test_sets)
cmd = (
f"python -m nemo_skills.inference.retrieve_similar "
f" ++retrieve_from='/nemo_run/code/nemo_skills/dataset/math/train.jsonl' "
f" ++compare_to=\\\'{compare_to}\\\'"
f" ++output_file='/workspace/math-contamination-retrieved.jsonl' "
f" ++top_k=1 "
)
run_cmd(
cluster="local",
container="nemo",
num_gpus=1, # can increase this if you have more gpus
ctx=wrap_arguments(cmd),
)
```
Next, you need to run LLM inference to check those closest found questions from the output file. Here is an example
using Llama-405B from Nvidia API catalog, but you can replace it with OpenAI models or self-hosted models.
```python
generate(
cluster="local",
generation_type="check_contamination",
input_file="/workspace/math-contamination-retrieved.jsonl",
output_dir="/workspace/math-contamination-results",
model="meta/llama-3.1-405b-instruct",
server_type="openai",
server_address="https://integrate.api.nvidia.com/v1",
)
```
This script will print an output that looks like this
```
Contamination portion: 13.91% (705/5070)
```
## To decontaminate training data
If you want instead to clean your training data from contaminated examples all the commands stay the same, but
you need to swap values for the `retrieve_from` and `compare_to` arguments in the `retrieve_similar` step
since we now want to make a check for each training set example and find closest test set problems.
After you get `/workspace/math-contamination-results/output.jsonl`,
you can pass it into [prepare_data command](training.md#preparing-the-data)
with `++contamination_file=...` option.
See a more detailed example in [OpenMathInstruct-2 dataset construction pipeline](../releases/openmathinstruct2/dataset.md#decontamination).
\ No newline at end of file
# Model evaluation
!!! info
This pipeline starting script is [nemo_skills/pipeline/eval.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/eval.py)
All extra parameters are passed to [nemo_skills/inference/generate.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/inference/generate.py)
We support many popular benchmarks and it's easy to add new ones in the future. E.g. we support
- Math problem solving: hmmt_feb25, brumo25, aime24, aime25, omni-math (and many more)
- Formal proofs in Lean: minif2f, proofnet
- Coding skills: livecodebench, human-eval, mbpp
- Chat/instruction following: ifeval, arena-hard
- General knowledge: mmlu, mmlu-pro, gpqa
- Long context: ruler
See [nemo_skills/dataset](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset) where each folder is a benchmark we support.
Here is how to run evaluation (using API model as an example,
but same command works with self-hosted models both locally and on slurm).
Make sure that `/workspace` is mounted inside of your
[cluster config](../basics/cluster-configs.md).
## Preparing data
You need to run the following commands to prepare the data.
```bash
ns prepare_data
```
If you're only interested in a subset of datasets (e.g. only math-related or code-related), run with
`--dataset_groups ...` and if you only need a couple of specific datasets, list them directly e.g.
```bash
ns prepare_data gsm8k human-eval mmlu ifeval
```
If you have the repo cloned locally, the data files will be available inside `nemo_skills/dataset/<benchmark>/<split>.jsonl`
and if you installed from pip, they will be downloaded to wherever the repo is installed, which you can figure out by running
```bash
python -c "import nemo_skills; print(nemo_skills.__path__)"
```
Some benchmarks (e.g. ruler) require extra parameters to be passed to the prepare_data script. Thus you'd need to explicitly
call `ns prepare_data` for all of them, e.g. for ruler you can use
```bash
ns prepare_data ruler --setup=llama_128k --tokenizer_path=meta-llama/Llama-3.1-8B-Instruct --max_seq_length=131072
```
## Greedy decoding
```bash
ns eval \
--cluster=local \
--server_type=openai \
--model=meta/llama-3.1-8b-instruct \
--server_address=https://integrate.api.nvidia.com/v1 \
--benchmarks=gsm8k,human-eval \
--output_dir=/workspace/test-eval
```
This will run evaluation on gsm8k and human-eval for Llama 3.1 8B model. If you're running
on slurm by default each benchmark is run in a separate job, but you can control this with
`--num_jobs` parameter.
After the evaluation is done, you can get metrics by calling
```bash
ns summarize_results --cluster local /workspace/test-eval
```
Which should print the following
```
---------------------------------------- gsm8k ----------------------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer
pass@1 | 1319 | 180 | 164 | 81.96% | 4.93%
------------------------------------------- human-eval -------------------------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | passing_base_tests | passing_plus_tests
pass@1 | 164 | 199 | 29 | 64.63% | 60.37%
```
The [summarize_results](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/summarize_results.py) script
will fetch the results from cluster automatically if you ran the job there.
!!! note
The numbers above don't match reported numbers for Llama 3.1 because we are not using
the same prompts by default. You would need to modify the prompt config for each specific benchmark
to match the results exactly. E.g. to match gsm8k numbers add `++prompt_config=llama3/gsm8k`
(but we didn't include all the prompts used for Llama3 evaluation, only a small subset as an example).
## Using multiple samples
You can add `:<num repeats>` after the benchmark name to repeat evaluation multiple times with high temperature
that can be used for majority voting or estimating pass@k. E.g. if we run with
```bash
ns eval \
--cluster=local \
--server_type=openai \
--model=meta/llama-3.1-8b-instruct \
--server_address=https://integrate.api.nvidia.com/v1 \
--benchmarks gsm8k:4,human-eval:4 \
--output_dir=/workspace/test-eval
```
you will see the following output after summarizing results
```
---------------------------------------- gsm8k -----------------------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | symbolic_correct | no_answer
pass@1[avg-of-4] | 1319 | 180 | 680 | 80.44% | 6.31%
majority@4 | 1319 | 180 | 680 | 88.40% | 0.15%
pass@4 | 1319 | 180 | 680 | 93.63% | 0.15%
-------------------------------------------- human-eval -------------------------------------------
evaluation_mode | num_entries | avg_tokens | gen_seconds | passing_base_tests | passing_plus_tests
pass@1[avg-of-4] | 164 | 215 | 219 | 64.63% | 59.30%
pass@4 | 164 | 215 | 219 | 79.27% | 74.39%
```
## Using data on cluster
Some benchmarks (e.g. ruler) have very large input datasets and it's inefficient to prepare them on a local machine and
keep uploading them to the cluster with every evaluation job. Instead, you can prepare them on the cluster directly. To do that,
run prepare_data command with `--data_dir` and `--cluster` options, e.g.
```bash
ns prepare_data \
--data_dir=/workspace/ns-data \
--cluster=slurm \
ruler --setup llama_128k --tokenizer_path meta-llama/Llama-3.1-8B-Instruct --max_seq_length 130900
```
Then during evaluation, you'd need to provide the same `data_dir` argument and it will read the data from cluster
directly. You can also use `NEMO_SKILLS_DATA_DIR` environment variable instead of an explicit argument.
Here is an example evaluation command for ruler that uses data_dir parameter
```python
from nemo_skills.pipeline.cli import eval, wrap_arguments
eval(
# using a low number of concurrent requests since it's almost entirely prefill stage
ctx=wrap_arguments("++max_concurrent_requests=32"),
cluster="slurm",
model="/hf_models/Meta-Llama-3.1-8B-Instruct",
server_type="sglang",
output_dir="/workspace/eval-ruler",
data_dir="/workspace/ns-data",
benchmarks="ruler.llama_128k",
server_gpus=8,
expname="eval-ruler",
)
```
## How the benchmarks are defined
Each benchmark exists as a separate folder inside
[nemo_skills/dataset](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset). Inside
those folders there needs to be a `prepare.py` script which can be run to download and format benchmark
data into a .jsonl input file (or files if it supports train/validation besides a test split) that
our scripts can understand. There also needs to be an `__init__.py` that defines some default variables
for that benchmark, such as prompt config, evaluation type, metrics class and a few more.
This information is then used inside eval pipeline to initialize default setup (but all arguments can
be changed from the command line).
Let's look at gsm8k to understand a bit more how each part of the evaluation works.
Inside [nemo_skills/dataset/gsm8k/\_\_init\_\_.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/gsm8k/__init__.py) we see the following
```python
# settings that define how evaluation should be done by default (all can be changed from cmdline)
PROMPT_CONFIG = 'generic/math'
DATASET_GROUP = 'math'
METRICS_TYPE = "math"
EVAL_ARGS = "++eval_type=math"
GENERATION_ARGS = ""
```
The prompt config and default generation arguments are passed to the
[nemo_skills/inference/generate.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/inference/generate.py) and
the default eval args are passed to the
[nemo_skills/evaluation/evaluate_results.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/evaluation/evaluate_results.py).
The dataset group is used by [nemo_skills/dataset/prepare.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/dataset/prepare.py)
to help download only benchmarks from a particular group if `--dataset_groups` parameter is used.
Finally, the metrics class is used by [nemo_skills/evaluation/metrics.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/evaluation/metrics.py)
which is called when you run summarize results pipeline.
To create a new benchmark in most cases you only need to add a new prepare script and the corresponding
default prompt. If the new benchmark needs some not-supported post-processing or metric summarization
you'd need to also add a new evaluation type and a new metrics class.
\ No newline at end of file
# Pipelines
## Basics
NeMo-Skills has a large collection of building blocks that you can use to construct various pipelines to improve LLMs.
All of the "pipeline" scripts are located in the [nemo_skills/pipeline](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/pipeline)
folder and have a unified interface that helps us connect them together.
Each pipeline script is a wrapper that accepts *wrapper* arguments that tell us how to orchestrate the job. These
arguments are directly listed in the corresponding Python function or visible when you run `ns <wrapper script> --help`.
Any other arguments that you pass to the wrapper script are directly passed into the *main* job that the wrapper
launches. These arguments are never checked when you submit a job, so if you have some mistake in them, you will only
know about that when the job starts running. For most of our *main* scripts we use [Hydra](https://hydra.cc/) and thus
their arguments typically start with `++arg_name`. If you're using Python API you would need to specify all *main* arguments with
`ctx=wrap_arguments("...")` interface for technical reasons.
This might sound a little complicated, so let's see how it works through an example from the [Getting Started Tutorial](../basics/index.md).
=== "ns interface"
```bash
ns generate \
--cluster=local \
--server_type=trtllm \
--model=/workspace/qwen2.5-1.5b-instruct-trtllm \
--server_gpus=1 \
--output_dir=/workspace/generation-local-trtllm \
--input_file=/workspace/input.jsonl \
++prompt_config=/workspace/prompt.yaml \
++prompt_template=qwen-instruct
```
=== "python interface"
```python
from nemo_skills.pipeline.cli import wrap_arguments, generate
generate(
cluster="local",
server_type="trtllm",
model="/workspace/qwen2.5-1.5b-instruct-trtllm",
server_gpus=1,
output_dir="/workspace/generation-local-trtllm",
input_file="/workspace/input.jsonl",
ctx=wrap_arguments(
"++prompt_config=/workspace/prompt.yaml "
"++prompt_template=qwen-instruct "
),
)
```
In this command all arguments starting with `--` are *wrapper* arguments and everything starting with `++` are *main* arguments.
If you run `ns generate --help` you will see all the ones with `--` listed there (and more), but not the `++` ones.
The help output also contains this message that specifies which underlying *main* script we run for this command and how
to check its arguments
```bash
`python -m nemo_skills.inference.generate --help` for other supported arguments
```
You can also open that script's code in
[nemo_skills/inference/generate.py](https://github.com/NVIDIA/NeMo-Skills/tree/main/nemo_skills/inference/generate.py)
and see all arguments and logic there.
You can chain multiple pipelines together to set proper slurm dependencies using `--run_after` parameter.
There is an example in [tutorial](../basics/index.md#slurm-inference) or in
[training documentation](training.md#chaining-pipelines-with-python).
## Common parameters
Many of our scripts have a shared set of common parameters that we list here.
### All pipeline scripts
All scripts inside pipeline folder have the following parameters.
- **--cluster**: You always need to specify a cluster config that will be used to
control where the job is executed.
- **--config_dir**: By default we search for cluster configs inside `cluster_configs`
local folder, but you can control where they are located with this parameter.
You can also use `NEMO_SKILLS_CONFIG_DIR` environment variable for this purpose.
- **--log_dir**: Can be used to customize the location of slurm logs.
- **--expname**: You can always specify an experiment name, which is a
[NeMo-Run](https://github.com/NVIDIA/NeMo-Run) concept. This will control where
the metadata is stored, the slurm job name and allows you to chain jobs one
after the other using the `--run_after` argument.
- **--run_after**: Can be used in conjunction with `--expname` to chain jobs to
run one after another (only applicable on slurm). E.g. run training job with
`--expname my-training-run` and then launch an eval with `--run_after my-training-run`.
- **--mount_paths**: Can be used to mount additional paths to the cluster config dynamically.
This is useful if you want to access some data that is not mounted in cluster config. E.g. use
`--mount_paths /my/remote/workspace:/workspace` to mount `/workspace` folder from the host
machine to the slurm job.
- **--check_mounted_paths**: This flag offers a few different capabilities for convenience:
- Check if the paths specified in the script are mounted correctly. This is useful if you want to make
sure that the paths that are mounted are available on remote machine before running the job.
E.g. use `--check_mounted_paths` to check if `/my/remote/workspace` folder from the host machine
is a folder that exists and can be mounted.
- In many cases, if the directory does not exist, we will create it for you. This is useful for
output and log directories.
- If paths are provided but not mounted, often times we will dynamically mount them for you.
- **--partition**: Can be used to run in a specific slurm partition (e.g. commonly used
to launch interactive jobs).
- **--not_exclusive**: Can be used if you want to request a part of the slurm node. By default
we set `exclusive=True`.
- **--time_min**: Can be used to specify minimum time after which the job might be killed by slurm.
Specify in the following format `00:30:00` (for 30 minutes). Using a lower value will help jobs
get scheduled faster.
- **--reuse_code** / **--reuse_code_exp**: Can be used to specify another experiment and reuse
its code (to avoid re-packaging/uploading to cluster). If running from Python we will automatically
reuse the last submitted experiment in the current Python session.
### Generation scripts
All of the scripts that involve LLM data generation accept a common set of parameters.
- **--model**: Either path to the model file or an API model name.
- **--server_type**: `nemo`, `trtllm`, `vllm` or `openai`. This is used on the client side
to correctly format a request to a particular server. This needs to match model
checkpoint format if self-hosting the model or has to be `openai` for both Nvidia
NIM API as well as the OpenAI API.
- **--server_address**: Only relevant for API models. E.g. use
`https://integrate.api.nvidia.com/v1` for Nvidia API and
`https://api.openai.com/v1` for OpenAI API.
- **--server_gpus**: Number of GPUs needed to host a model (only applicable to self-hosted models).
- **--server_nodes**: Number of nodes needed to host a model (only applicable to self-hosted models).
- **--server_args**: Any other arguments you need to pass to a corresponding server.
E.g. use `--server_args="--gpu-memory-utilization=0.99"` to change gpu memory utilization of a
vLLM server.
\ No newline at end of file
# LLM-as-a-judge for math evaluation
!!! info
This pipeline starting script is [nemo_skills/pipeline/generate.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/generate.py)
All extra parameters are passed to [nemo_skills/inference/llm_math_judge.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/inference/llm_math_judge.py)
When evaluating complex mathematical questions, it's very hard to have a rule-based symbolic comparison system.
While we do perform such comparison by default, for most accurate results it's best to use LLM-as-a-judge pipeline.
E.g. symbolic comparison can perform very inaccurately for multi-choice questions where an answer might either be
one of the letters or an expression corresponding to that letter.
If you have an output of the [evaluation script](evaluation.md) on e.g. math benchmark, you can run LLM-as-a-judge
in the following way (assuming you have `/workspace` mounted in your [cluster config](../basics/cluster-configs.md)
and evaluation output available in `/workspace/test-eval/eval-results`).
```bash
ns generate \
--generation_type=math_judge \
--cluster=local \
--model=gpt-4o \
--server_type=openai \
--server_address=https://api.openai.com/v1 \
--output_dir=/workspace/test-eval-judge/eval-results/math \
--input_dir=/workspace/test-eval/eval-results/math \
--num_random_seeds=<num seeds used for generation>
```
This will run the judge pipeline on the data inside `eval-results/math` folder and judge solutions from `output-rsX.jsonl` files.
If you ran the benchmark with a single generation (e.g. using `math` or `math:0`) then
use `--input_file=/workspace/test-eval/eval-results/math/output.jsonl` instead of `--input_dir` and `--num_random_seeds` arguments.
In this example we use gpt-4o from OpenAI, but you can use Llama-405B (that you can host on cluster yourself) or any
other models. If you have multiple benchmarks, you would need to run the command multiple times.
After the judge pipeline has finished, you can see the results by running
```bash
ns summarize_results /workspace/test-eval-judge --cluster local
```
Which should output something like this
```
------------------------------------------------- aime24 ------------------------------------------------
evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
pass@1 | 30 | 20.00 | 20.00 | 20.00 | 20.00 | 13.33
------------------------------------------------- gsm8k -------------------------------------------------
evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
pass@1 | 1319 | 95.00 | 95.75 | 95.00 | 95.75 | 0.00
-------------------------------------------------- math -------------------------------------------------
evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
pass@1 | 5000 | 67.32 | 67.88 | 67.02 | 68.18 | 2.64
------------------------------------------------- amc23 -------------------------------------------------
evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
pass@1 | 40 | 47.50 | 47.50 | 47.50 | 47.50 | 7.50
```
If you want to see where symbolic comparison differs from judge comparison, run with `--debug` option.
We use the following [judge prompt](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/prompt/config/judge/math.yaml)
by default, but you can customize it the same way as you [customize any other prompt](../basics/prompt-format.md).
\ No newline at end of file
# Running arbitrary commands
!!! info
This pipeline starting script is [nemo_skills/pipeline/run_cmd.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/run_cmd.py)
All extra parameters are directly executed as a shell command.
We often need to run arbitrary pre/post processing commands as part of a larger pipeline and thus we provide a simple
`run_cmd` utility that can be used to schedule those on slurm. Here is an example that simply enters the packaged
code and tries to install it (will finish with error if not running from NeMo-Skills repo or other installable package).
```bash
ns run_cmd --cluster=local cd /nemo_run/code/ && pip install -e .
```
There are many more examples of how to use `run_cmd` throughout our documentation.
## LLM Server and Sandbox Server
While we can run arbitrary commands with the default `run_cmd` script, we also provide the ability to
run an LLM server with the `--model` argument and a few extra arguments for the server config. These arguments
are similar to the ones used for `start_server` script.
This can be useful to run a server on a local machine or on a cluster with GPUs in a slurm job, while also being able to
run arbitrary code that uses LLM calls.
### Example
Say you have the following inference file that uses OpenAI API with a vLLM backed server (say to run a
project that is compatible with OpenAI API). Imagine a file called `inference.py` with the following code:
```python
from openai import OpenAI
client = OpenAI(api_key='EMPTY', base_url=f"http://0.0.0.0:5000/v1", timeout=None)
api_model = client.models.list().data[0].id
response = client.chat.completions.create(
model=api_model,
messages=[
{"role": "user", "content": "What is the capital of France?"},
],
temperature=0.7,
max_tokens=128,
top_p=0.95,
n=1,
stream=False,
)
print(response.choices[0].message.content)
```
Then we can run the server and the inference code in a single command as below. The `--with_sandbox` argument starts the
code execution server that can be used to run arbitrary code in a sandboxed environment and is added here just as a
demonstration. While the current example does not use it, this can be useful to execute code or to run code that
requires a specific environment in a container.
**Note**: While the container is a little more secure than running code directly on the host, it is still not a fully
secure sandbox and should not be used to run untrusted code.
```bash
ns run_cmd \
--cluster=local \
--model=Qwen/Qwen3-1.7B \
--server_type=vllm \
--server_gpus=1 \
--with_sandbox \
cd /nemo_run/code/ && python inference.py
```
This will launch the LLM inference server, the sandbox server and then run the inference code.
\ No newline at end of file
# Training using verl or OpenRLHF
!!! info
Depending on the algorithm/framework, this pipeline starting script is
* [nemo_skills/pipeline/openrlhf/sft.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/openrlhf/sft.py)
* [nemo_skills/pipeline/openrlhf/ppo.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/openrlhf/ppo.py)
* [nemo_skills/pipeline/verl/ppo.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/verl/ppo.py)
All extra parameters are passed to
* [openrlhf.cli.train_sft](https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/cli/train_sft.py)
* [openrlhf.cli.train_ppo_ray](https://github.com/OpenRLHF/OpenRLHF/blob/main/openrlhf/cli/train_ppo_ray.py)
* [verl.trainer.main_ppo](https://github.com/volcengine/verl/blob/main/verl/trainer/main_ppo.py)
!!! warning
OpenRLHF support is experimental and incomplete. We use the following
custom fork and it might not be easy to switch to the official repository version.
* OpenRLHF: https://github.com/Kipok/OpenRLHF
The documentation here is incomplete and we advise you to open an issue if you
plan to try something that is not covered below to get additional support.
For OpenRLHF, please use the following non-default container `vllm: igitman/nemo-skills-vllm:0.6.0`
## SFT with OpenRLHF
Here is an example of running SFT job with OpenRLHF.
Our standard [SFT data format](./training.md#preparing-the-data) can be
used here.
```python
from nemo_skills.pipeline.cli import wrap_arguments, sft_openrlhf
sft_openrlhf(
ctx=wrap_arguments(""),
cluster="slurm",
expname="test-openrlhf-sft",
output_dir="/workspace/test-openrlhf-sft",
hf_model="/hf_models/Qwen2.5-1.5B-Instruct",
training_data="/data/sft-data.jsonl",
num_gpus=8,
num_nodes=2,
num_training_jobs=1,
)
```
## PPO with OpenRLHF
Here is an example of running PPO job with OpenRLHF.
Our standard [SFT data format](./training.md#preparing-the-data) can be
used here.
```python
from nemo_skills.pipeline.cli import wrap_arguments, ppo_openrlhf
ppo_openrlhf(
ctx=wrap_arguments(
"--ref_num_gpus_per_node=4 "
"--actor_num_gpus_per_node=4 "
"--vllm_num_engines=2 "
"--vllm_tensor_parallel_size=2 "
"--ref_num_nodes=1 "
"--actor_num_nodes=1 "
"--colocate_actor_ref "
"--advantage_estimator=reinforce "
"--remote_rm_url /nemo_run/code/nemo_skills/training/openrlhf/math_reward.py "
),
cluster="slurm",
expname="test-openrlhf-ppo",
output_dir="/workspace/test-openrlhf-ppo",
hf_model="/hf_models/Qwen2.5-1.5B-Instruct",
prompt_data="/data/rl-data.jsonl",
num_gpus=8,
num_nodes=2,
# this is used for the LLM judge
server_gpus=8,
server_type='trtllm',
server_model='/hf_models/Qwen2.5-32B-Instruct',
num_training_jobs=1,
)
```
## PPO with verl
Here is an example of running PPO job with verl.
You can use [nemo_skills/training/verl/prepare_data.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/verl/prepare_data.py) to convert
our standard [SFT data format](./training.md#preparing-the-data) into parquet.
```python
from nemo_skills.pipeline.cli import wrap_arguments, ppo_verl
ppo_verl(
ctx=wrap_arguments(
'++trainer.save_freq=0 '
'++data.train_batch_size=32 '
'++reward_model.compute_score=math-judge '
'++reward_model.reward_manager=batched '
'++data.filter_prompts=False '
'++actor_rollout_ref.rollout.gpu_memory_utilization=0.7 '
'++data.max_response_length=12000 '
'++actor_rollout_ref.rollout.n=64 '
'++actor_rollout_ref.rollout.tensor_model_parallel_size=2 '
),
cluster="slurm",
expname="test-verl-ppo",
output_dir="/workspace/test-verl-ppo",
hf_model="/hf_models/Qwen2.5-1.5B-Instruct",
prompt_data="/data/rl-data.parquet",
num_gpus=8,
num_nodes=2,
# this is used for the LLM judge
server_gpus=8,
server_type='trtllm',
server_model='/hf_models/Qwen2.5-32B-Instruct',
num_training_jobs=1,
)
```
\ No newline at end of file
# Training using NeMo-Aligner
!!! info
This pipeline starting script is [nemo_skills/pipeline/train.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/pipeline/train.py)
All extra parameters are passed to either [nemo_skills/training/start_sft.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/start_sft.py) or [nemo_skills/training/start_dpo.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/start_dpo.py)
## Preparing the data
Before running the training we need to prepare the data in the right format. Here is an example command
```bash
python -m nemo_skills.training.prepare_data \
++input_files="<path to the generated synthetic data>/output-rs*.jsonl" \
++output_path=sft-data.jsonl \
++prompt_config=generic/math \
++prompt_template=llama3-instruct
```
!!! tip
Many scripts accept `++input_files` argument. You can use any glob patterns there and also
reference multiple files/patterns separated by space or comma.
If you want to run that command inside a container or on a cluster, add `ns run_cmd --cluster=...` in the beginning.
You need to pass in the config/template files so that we can format the data accordingly. There are many more parameters
that data preparation script supports which you can see
[here](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/data_preparation_utils/config/math_sft.yaml).
We are using [SDP library](https://github.com/NVIDIA/NeMo-speech-data-processor) for preparing the data, so it's
a good idea to check their documentation to understand how this config is structured.
!!! note
Even though we support both SFT and DPO training, the data preparation is currently only implemented
for SFT jobs. For DPO, you'd need to manually prepare the data according to the
[NeMo-Aligner documentation](https://docs.nvidia.com/nemo-framework/user-guide/latest/modelalignment/dpo.html#dpo-model-training)
## Running training
We use [NeMo-Aligner](https://github.com/NVIDIA/NeMo-Aligner/) to run LLM training,
so you can check their documentation to learn about all supported parameters.
Here is an example of how to run a training job.
```bash
ns train \
--cluster=slurm \
--expname=my-training-job \
--output_dir=/workspace/my-training-job/checkpoints \
--nemo_model=/nemo_models/llama3.1-8b-base \
--num_nodes=8 \
--num_gpus=8 \
--num_training_jobs=4 \
--training_data=/data/sft-data.jsonl
```
This will run training on 8 nodes of 8 GPUs, using 4 dependent slurm jobs.
By default we are training for 2 epochs, saving checkpoints every 1000 steps,
but you can adjust these values. It's also recommended to tune micro batch size
and tensor parallel parameters for optimal performance. E.g. these are good
defaults for an 8B model size
```bash
++model.data.train_ds.micro_batch_size=4 \
++model.tensor_model_parallel_size=4
```
You can customize any of the SFT parameters by directly providing them, e.g.
to disable wandb logging and add dropout use
```bash
--disable_wandb \
++model.ffn_dropout=0.1 \
++model.attention_dropout=0.1 \
++model.hidden_dropout=0.1
```
The training script will average all of your generated checkpoints upon completion
(we found this to consistently increase the downstream accuracy). If you want to
only average a subset of checkpoints, add the `--average_steps` parameter (e.g. if you
want to disable averaging, set it to the last training step). If you only want
to average the checkpoints of the finished job, set `--num_training_jobs=0`.
## Chaining pipelines with Python
Typically after training we want to follow up with evaluation. You can schedule
an evaluation job right away by providing a `--run_after=my-training-job` argument
which will appropriately set slurm dependencies. Here is how you can chain the commands
to schedule checkpoint conversion and evaluation after training
(whenever you need to run multiple commands, it's more convenient to use python interface)
```python
from nemo_skills.pipeline.cli import wrap_arguments, train, convert, eval
expname = "my-training-job"
cluster = "slurm"
output_dir = f"/workspace/{expname}/checkpoints"
train(
ctx=wrap_arguments(""),
cluster=cluster,
expname=expname,
output_dir=output_dir,
nemo_model="/nemo_models/llama3.1-8b-base",
num_nodes=8,
num_gpus=8,
num_training_jobs=4,
training_data="/data/sft-data.jsonl",
)
convert(
ctx=wrap_arguments(""),
cluster=cluster,
input_model=f"{output_dir}/model-averaged-nemo",
output_model=f"{output_dir}/model-averaged-hf",
expname=f"{expname}-to-hf",
run_after=expname,
convert_from="nemo",
convert_to="hf",
model_type="llama",
num_gpus=8,
hf_model_name="meta-llama/Meta-Llama-3.1-8B",
)
eval(
ctx=wrap_arguments("++prompt_template=llama3-instruct"),
cluster=cluster,
model=f"{output_dir}/model-averaged-hf",
server_type="trtllm",
output_dir=f"{output_dir}/results/",
benchmarks="gsm8k,math",
server_gpus=8,
run_after=f"{expname}-to-hf",
)
```
## Using sequence packing and context parallel
When training on sequences >4k or so, it's recommended to use sequence packing and context parallel.
Here is an example of how to do that. Most of the parameters don't need to change, but
the `global_batch_size` might need to be adjusted to be n times smaller than without packing,
where n is the average number of sequences per pack that the packing script outputs, e.g.
```
[NeMo I 2025-01-16 13:57:37 prepare_packed_ft_dataset:165] Packing sequences to length 16384...
[NeMo I 2025-01-16 15:06:24 prepare_packed_ft_dataset:182] Packing is 98.23% efficient
[NeMo I 2025-01-16 15:06:24 prepare_packed_ft_dataset:183] >>>>> For pack size 16384, average number of sequences per pack is n = 3.669 <<<<<
```
Here is an example of running packing and training.
```python
from nemo_skills.pipeline.cli import wrap_arguments, train, run_cmd
expname = "my-training-job"
cluster = "slurm"
output_dir = f"/workspace/{expname}/checkpoints"
# your memory consumption will be similar to a job with
# `pack_seq_length / context_parallel` sequences without packing
pack_seq_length = 16384
context_parallel = 4
original_bs = 512
avg_sequences_per_pack = 3.7
# you need to make sure this is divisible by your data parallel rank,
# so might need to round to a power of 2
packed_bs = int(original_bs // avg_sequences_per_pack)
# Make sure that train_ds.file_names is included in the bucket e.g., [/data/sft-data.jsonl]
packing_cmd = (
f"python /nemo_run/code/nemo_skills/training/prepare_packed_ft_dataset.py "
f" ++model.data.train_ds.file_names=[/data/sft-data.jsonl] "
f" ++model.data.train_ds.max_seq_length={pack_seq_length} "
f" ++model.context_parallel_size={context_parallel} "
f" ++tokenizer_path=/hf_models/Meta-Llama-3.1-8B "
f" ++output_dir=/data "
f" ++pack_sizes=[{pack_seq_length}] "
f" ++model.data.train_ds.hf_dataset=True "
)
run_cmd(
ctx=wrap_arguments(packing_cmd),
cluster=cluster,
expname=f"{expname}-packing",
container="nemo", # please use "nemo container" for packed data preparation
# this is a cpu-only operation, so if a cluster has a good cpu partition, it can be used
# note that this is an expensive operation requiring a lot of CPUs and RAM
)
# The `packing_cmd` generates three files when `pack_seq_length=16384` is used, for example:
# `packed_16384_seed0.input_ids.npy`
# `packed_16384_seed0.loss_mask.npy`
# `packed_16384_seed0.seq_start_id.npy`
# For training, set training_data=packed_16384_seed0.npy
# Refer to the _load_dataset_alt function in nemo_skills/training/gpt_sft_dataset.py for details on why this is required.
train(
ctx=wrap_arguments(
f"++model.data.train_ds.packed_sequence=True "
f"++model.data.train_ds.micro_batch_size=1 " # should always be 1 for packed jobs
f"++model.data.train_ds.global_batch_size={packed_bs} "
f"++model.context_parallel_size={context_parallel} "
f"++model.data.train_ds.max_seq_length={pack_seq_length} "
# all other parameters are generally the same as for the non-packed job with
# max seq length = packed_seq_length / context_parallel
# and keep in mind that each step now processes avg_sequences_per_pack * packed_bs examples
),
cluster=cluster,
expname=expname,
run_after=f"{expname}-packing",
output_dir=output_dir,
nemo_model="/nemo_models/llama3.1-8b-base",
num_nodes=8,
num_gpus=8,
num_training_jobs=4,
training_data=f"/data/packed_{pack_seq_length}_seed0.npy",
)
# can follow up with the same convert/eval steps as above
```
If your data size is very large (i.e. >1M samples), you might run out of memory when doing packing on full data.
If that's the case, it's recommended to split data into smaller chunks and then merge them using
[nemo_skills/training/merge_packed_data.py](https://github.com/NVIDIA/NeMo-Skills/blob/main/nemo_skills/training/merge_packed_data.py)
Example command:
```bash
python nemo_skills/training/merge_packed_data.py \
--input_prefixes <chunk 1 folder>/packed_24576_seed0 <chunk 2 folder>/packed_24576_seed0 \
--output_prefix <final data folder>/packed_24576_seed0
```
---
title: Papers & Releases
hide:
- toc
---
On this page you can find a list of papers, model and dataset releases that were created using NeMo-Skills.
## Releases
* [OpenReasoning](openreasoning/index.md) models
* [OpenCodeReasoning](opencodereasoning/index.md) dataset and models
* [OpenMathReasoning](openmathreasoning/index.md) dataset and models
* [OpenMathInstruct-2](openmathinstruct2/index.md) dataset and models
## Papers
* [GenSelect: A Generative Approach to Best-of-N](https://openreview.net/pdf?id=8LhnmNmUDb){:target="_blank"} (2025)
* [The Challenge of Teaching Reasoning to LLMs Without RL or Distillation](https://arxiv.org/abs/2507.09850){:target="_blank"} (2025)
* [OpenCodeReasoning: Advancing Data Distillation for Competitive Coding](https://arxiv.org/abs/2504.01943){:target="_blank"} (2025)
* [AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning Models with OpenMathReasoning dataset](https://arxiv.org/abs/2504.16891){:target="_blank"} (2025)
* [OpenMathInstruct-2: Accelerating AI for Math with Massive Open-Source Instruction Data](https://arxiv.org/abs/2410.01560){:target="_blank"} (2024)
* [OpenMathInstruct-1: A 1.8 Million Math Instruction Tuning Dataset](https://arxiv.org/abs/2402.10176){:target="_blank"} (2024)
# Dataset construction
The [OpenCodeReasoning-1](https://huggingface.co/datasets/nvidia/OpenCodeReasoning) and [OpenCodeReasoning-2](https://huggingface.co/datasets/nvidia/OpenCodeReasoning-2) datasets consist of competitive coding problems collected from [TACO](https://huggingface.co/datasets/BAAI/TACO), [APPS](https://huggingface.co/datasets/codeparrot/apps), [CodeContests](https://huggingface.co/datasets/deepmind/code_contests) and [CodeForces](https://huggingface.co/datasets/open-r1/codeforces). Below we describe the pipeline used to create these datasets. All relevant scripts are available in
[recipes/opencodereasoning](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/opencodereasoning) folder.
If you don't have a slurm cluster with a large number of GPUs,
you can still try out all the steps of our pipeline by using [Nvidia NIM models](https://build.nvidia.com/). You can extract the questions set in its entirety following the [prepare_questions.py script](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/opencodereasoning/pipeline/prepare_questions.py) and you can
switch to that data and NIM models by adding `--mode demo` to the pipeline commands. We also use different models
in this "demo" mode to make it faster, but you can change [configs/demo.yaml](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/opencodereasoning/configs/demo.yaml) to pick
any other models supported in https://build.nvidia.com. Make sure to define `NVIDIA_API_KEY` environment variable for this to work
(and ignore scraping and model preparation steps as they are not needed when using NIM models).
Finally, please make sure to go through the
[getting started documentation](../../basics/index.md) to make sure you understand how the below commands
work and avoid running into errors.
## Data preparation (Question set)
The question set is preprocessed as part of the [prepare_questions.py](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/opencodereasoning/pipeline/prepare_questions.py) script. This script will download the original datasets, extract just the questions and filter out super long instructions that may interfere with training.
**Note**: OCR-1 questions are a subset of OCR-2 questions, and it is recommended to generate data for OCR-2 directly.
To download and preprocess the question set you can run the following script. We assume our /workspace points to the directory where NeMo-Skills is cloned, but you can change the output directory to any other location:
```bash
python prepare_questions.py --cluster local --expname "toy" --output_dir "/workspace/recipes/opencodereasoning/data/"
```
This script will download the 4 individual seed datasets above, along with the OpenCodeReasoning-2 dataset in order to perform a mapping from question ids to questions, gather the unique questions in the dataset, and truncate the discussions that are longer than 3200 Qwen 2.5 tokens. The prepared data will be saved as `open_code_reasoning_questions.jsonl`.
The output file should have ~34K rows, so all of the following commands will take a very long time and require a big
number of GPUs if you want to run them on full data. If you just want to try out the full pipeline, we recommend to subsample
the dataset by e.g. running
```bash
mv open_code_reasoning_questions.jsonl open_code_reasoning_questions_full.jsonl
head -n 1000 open_code_reasoning_questions_full.jsonl > open_code_reasoning_questions.jsonl
```
**Note**: The questions from this dataset are already decontaminated against LiveCodeBench v6 2408-2505. However if you are evaluating against a newer version of LiveCodeBench, you may need to perform decontamination yourself. You can follow the instructions here to construct [decontamination pipeline](https://nvidia.github.io/NeMo-Skills/pipelines/decontamination/).
## Solution generation pipeline
[Solution generation pipeline](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/opencodereasoning/pipeline/prepare_solutions.py)
consists of the following stages:
1. Generate solutions using some reasoning model for each of the prepared problems (`generate_solutions` stage).
2. Filter the solutions based on whether the reasoning trace completed successfully or not (`filter_solutions` stage).
You can run the full pipeline with
```bash
python recipes/opencodereasoning/pipeline/prepare_solutions.py --mode r1
```
You can specify a subset of stages using `--stages` argument, e.g. `--stages generate_solutions` or `--stages generate_solutions,filter_solutions`.
If you want to run using [Nvidia NIM models](https://build.nvidia.com/models), change to `--mode demo`.
# Model evaluation
Here are the commands you can run to reproduce our evaluation numbers.
The commands below are for [nvidia/OpenCodeReasoning-Nemotron-1.1-7B](https://huggingface.co/nvidia/OpenCodeReasoning-Nemotron-1.1-7B) model as an example.
We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
executing all commands from that folder locally. Change all commands accordingly
if running on slurm or using different paths.
## Download models
Get the model from HF. E.g.
```bash
# cd into your /workspace folder
pip install -U "huggingface_hub[cli]"
huggingface-cli download nvidia/OpenCodeReasoning-Nemotron-1.1-7B --local-dir OpenCodeReasoning-Nemotron-1.1-7B
```
## Prepare evaluation data
```bash
ns prepare_data livecodebench
```
## Run evaluation
```bash
ns eval \
--cluster=local \
--model=/workspace/OpenCodeReasoning-Nemotron-1.1-7B \
--server_type=vllm \
--output_dir=/workspace/OpenCodeReasoning-Nemotron-1.1-7B-eval \
--benchmarks=livecodebench:8 \
--split=test_v6_2408_2505 \
--server_gpus=1 \
++prompt_template=qwen-instruct \
++inference.tokens_to_generate=64000
```
Finally, to print the metrics run
```bash
ns summarize_results /workspace/OpenCodeReasoning-Nemotron-1.1-7B-eval/eval-results --cluster local
```
The numbers may vary by 1-2% depending on the server type, number of GPUs and batch size used.
# OpenCodeReasoning
This section has instructions for training a model that attains results similar to
[OpenCodeReasoning](https://arxiv.org/abs/2504.01943).
Please note that unless you have access to a large GPU cluster, it might take a very long time
for some of the commands to complete!
- [Model evaluation](evaluation.md)
- [Dataset construction](dataset.md)
# Model evaluation
Here are the commands you can run to reproduce our evaluation numbers.
The commands below are for the OpenMath2-Llama3.1-8B model as an example.
We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
executing all commands from that folder locally. Change all commands accordingly
if running on slurm or using different paths.
## Download models
Get the model from HF. E.g.
```bash
pip install -U "huggingface_hub[cli]"
huggingface-cli download nvidia/OpenMath2-Llama3.1-8B --local-dir OpenMath2-Llama3.1-8B
```
## Prepare evaluation data
```bash
ns prepare_data gsm8k math amc23 aime24 omni-math
```
## Run greedy decoding
```bash
ns eval \
--cluster=local \
--model=/workspace/OpenMath2-Llama3.1-8B \
--server_type=trtllm \
--output_dir=/workspace/openmath2-llama3.1-8b-eval \
--benchmarks=aime24,amc23,math,gsm8k,omni-math \
--server_gpus=1 \
--num_jobs=1 \
++prompt_template=llama3-instruct \
++inference.tokens_to_generate=4096
```
If running on slurm, you can set `--num_jobs` to a bigger number or -1 to run
each benchmark on a separate node. The number of GPUs needs to match what you used
in the conversion command.
After the generation is done, we want to run LLM-as-a-judge evaluation to get more
accurate numbers than symbolic comparison. You need to define `OPENAI_API_KEY` for
the command below to work.
```bash
for dataset in aime24 amc23 math gsm8k omni-math; do
ns generate \
--generation_type=math_judge \
--cluster=local \
--model=gpt-4o \
--server_type=openai \
--server_address=https://api.openai.com/v1 \
--output_dir=/workspace/openmath2-llama3.1-8b-eval-judged/eval-results/${dataset} \
--input_dir=/workspace/openmath2-llama3.1-8b-eval/eval-results/${dataset}
done
```
Finally, to print the metrics run
```bash
ns summarize_results /workspace/openmath2-llama3.1-8b-eval-judged/eval-results --cluster local
```
This should print the metrics including both symbolic and judge evaluation. The judge is typically more accurate.
```
------------------------------------------------- aime24 ------------------------------------------------
evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
pass@1 | 30 | 10.00 | 10.00 | 10.00 | 10.00 | 6.67
------------------------------------------------- gsm8k -------------------------------------------------
evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
pass@1 | 1319 | 90.75 | 91.70 | 90.75 | 91.70 | 0.00
----------------------------------------------- omni-math -----------------------------------------------
evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
pass@1 | 4428 | 18.97 | 22.22 | 18.11 | 23.08 | 2.55
-------------------------------------------------- math -------------------------------------------------
evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
pass@1 | 5000 | 67.70 | 68.10 | 67.50 | 68.30 | 1.36
------------------------------------------------- amc23 -------------------------------------------------
evaluation_mode | num_entries | symbolic_correct | judge_correct | both_correct | any_correct | no_answer
pass@1 | 40 | 32.50 | 40.00 | 32.50 | 40.00 | 0.00
```
The numbers may vary by 1-2% depending on the server type, number of GPUs and batch size used.
## Run majority voting
```bash
ns eval \
--cluster=local \
--model=/workspace/OpenMath2-Llama3.1-8B \
--server_type=trtllm \
--output_dir=/workspace/openmath2-llama3.1-8b-eval \
--benchmarks=aime24:256,amc23:256,math:256,gsm8k:256,omni-math:256 \
--server_gpus=1 \
--num_jobs=1 \
++prompt_template=llama3-instruct \
++inference.tokens_to_generate=4096
```
This will take a very long time unless you run on slurm cluster. After the generation is done, you will be able
to see symbolic scores right away. You can evaluate with the judge by first creating new files with majority
answers. E.g. for "math" benchmark run
```bash
python -m nemo_skills.evaluation.aggregate_answers \
++input_dir="./openmath2-llama3.1-8b-eval/eval-results/math" \
++input_files="output-rs*.jsonl" \
++mode=extract \
++output_dir="./openmath2-llama3.1-8b-eval/eval-results-majority/math"
```
This will output "./openmath2-llama3.1-8b-eval/eval-results-majority/math/output-agg.jsonl" file with majority answer. We can run the llm-judge pipeline on it.
Repeat the above steps for all benchmarks. Now we are ready to run the judge pipeline and summarize results
after it is finished. You need to define `OPENAI_API_KEY` for the command below to work.
```bash
for dataset in aime24 amc23 math gsm8k omni-math; do
ns generate \
--generation_type=math_judge \
--cluster=local \
--model=gpt-4o \
--server_type=openai \
--server_address=https://api.openai.com/v1 \
--output_dir=/workspace/openmath2-llama3.1-8b-eval-judged/eval-results-majority/${dataset} \
--input_file=/workspace/openmath2-llama3.1-8b-eval/eval-results-majority/${dataset}/output-agg.jsonl
done
```
```bash
ns summarize_results /workspace/openmath2-llama3.1-8b-eval-judged/eval-results-majority --cluster local
```
This will print majority results (they will be labeled as `greedy` since we fused them into a single file).
You can also ignore the symbolic score as it's not accurate anymore after we filled majority answers.
# OpenMathInstruct-2
Using our pipelines we created [OpenMathInstruct-2 dataset](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2)
which consists of 14M question-solution pairs (600K unique questions), making it nearly eight times larger
than the previous largest open-source math reasoning dataset.
The models trained on this dataset achieve strong results on common mathematical benchmarks.
<table>
<tr>
<td style="text-align: center;">model</td>
<td style="text-align: center;">GSM8K</td>
<td style="text-align: center;">MATH</td>
<td style="text-align: center;">AMC 2023</td>
<td style="text-align: center;">AIME 2024</td>
<td style="text-align: center;">Omni-MATH</td>
</tr>
<tr>
<td style="text-align: right;">Llama3.1-8B-Instruct</td>
<td style="text-align: center;">84.5</td>
<td style="text-align: center;">51.9</td>
<td style="text-align: center;">9/40</td>
<td style="text-align: center;">2/30</td>
<td style="text-align: center;">12.7</td>
</tr>
<tr>
<td style="text-align: right;">OpenMath2-Llama3.1-8B (<a href="https://huggingface.co/nvidia/OpenMath2-Llama3.1-8B-nemo">nemo</a> | <a href="https://huggingface.co/nvidia/OpenMath2-Llama3.1-8B">HF</a>)</td>
<td style="text-align: center;">91.7</td>
<td style="text-align: center;">67.8</td>
<td style="text-align: center;">16/40</td>
<td style="text-align: center;">3/30</td>
<td style="text-align: center;">22.0</td>
</tr>
<tr>
<td style="text-align: right;">+ majority@256</td>
<td style="text-align: center;">94.1</td>
<td style="text-align: center;">76.1</td>
<td style="text-align: center;">23/40</td>
<td style="text-align: center;">3/30</td>
<td style="text-align: center;">24.6</td>
</tr>
<tr>
<td style="text-align: right;">Llama3.1-70B-Instruct</td>
<td style="text-align: center;">95.1</td>
<td style="text-align: center;">68.0</td>
<td style="text-align: center;">19/40</td>
<td style="text-align: center;">6/30</td>
<td style="text-align: center;">19.0</td>
</tr>
<tr>
<td style="text-align: right;">OpenMath2-Llama3.1-70B (<a href="https://huggingface.co/nvidia/OpenMath2-Llama3.1-70B-nemo">nemo</a> | <a href="https://huggingface.co/nvidia/OpenMath2-Llama3.1-70B">HF</a>)</td>
<td style="text-align: center;">94.9</td>
<td style="text-align: center;">71.9</td>
<td style="text-align: center;">20/40</td>
<td style="text-align: center;">4/30</td>
<td style="text-align: center;">23.1</td>
</tr>
<tr>
<td style="text-align: right;">+ majority@256</td>
<td style="text-align: center;">96.0</td>
<td style="text-align: center;">79.6</td>
<td style="text-align: center;">24/40</td>
<td style="text-align: center;">6/30</td>
<td style="text-align: center;">27.6</td>
</tr>
</table>
## Paper
[OpenMathInstruct-2: Accelerating AI for Math with Massive Open-Source Instruction Data](https://arxiv.org/abs/2410.01560)
If you find our work useful, please consider citing us!
```bibtex
@inproceedings{toshniwal2024openmathinstruct2,
title = {{OpenMathInstruct-2: Accelerating AI for Math with Massive Open-Source Instruction Data}},
author = {Shubham Toshniwal and Wei Du and Ivan Moshkov and Branislav Kisacanin and Alexan Ayrapetyan and Igor Gitman},
year = {2025},
booktitle = {ICLR},
}
```
## How to reproduce our results
Browse the sections below to see all commands needed to fully reproduce our results.
Please note that unless you have access to a large GPU cluster, it might take a very long time
for some of the commands to complete!
- [Model evaluation](evaluation.md)
- [Dataset construction](dataset.md)
- [Model training](training.md)
# Model training
We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
executing all commands from that folder locally. Change all commands accordingly
if running on slurm or using different paths.
## Download data
Get the data from [HuggingFace](https://huggingface.co/datasets/nvidia/OpenMathInstruct-2).
This might take 20-30 minutes (or more depending on your network connection) and will use ~20Gb of RAM.
```python
import json
from datasets import load_dataset
from tqdm import tqdm
# Download the full training split from the Hugging Face Hub.
dataset = load_dataset('nvidia/OpenMathInstruct-2', split='train')

print("Converting dataset to jsonl format")
output_file = "openmathinstruct2.jsonl"
# Write one JSON object per line; ensure_ascii=False keeps non-ASCII
# characters as-is instead of escaping them.
with open(output_file, 'w', encoding='utf-8') as f:
    for item in tqdm(dataset):
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

print(f"Conversion complete. Output saved as {output_file}")
```
You can also download a subset of the data by using e.g. `split='train_5M'`, which we used to train the 70B model.
See the dataset page for more details about this.
## Convert to SFT format
Convert the data into the SFT format that NeMo-Aligner understands.
```bash
ns run_cmd --cluster=local \
python -m nemo_skills.training.prepare_data \
++prompt_template=llama3-instruct \
++prompt_config=generic/math \
++preprocessed_dataset_files=/workspace/openmathinstruct2.jsonl \
++output_key=generated_solution \
++output_path=/workspace/openmathinstruct2-sft.jsonl \
++hf_model_name="meta-llama/Meta-Llama-3.1-8B" \
++filters.drop_multi_boxed=false \
++filters.trim_prefix=false \
++filters.trim_solutions=false \
++filters.drop_incorrect_arithmetic=false \
++filters.split_arithmetic=false \
++filters.remove_contaminated=false
```
## Prepare base model
Download the base model and convert it to NeMo format.
The instructions below are for Llama3.1-8B, but the same commands should work for 70B model as well.
```bash
pip install -U "huggingface_hub[cli]"
huggingface-cli download meta-llama/Llama-3.1-8B --local-dir Llama-3.1-8B
ns convert \
--cluster=local \
--input_model=/workspace/Llama-3.1-8B \
--output_model=/workspace/llama3.1-8b-nemo \
--convert_from=hf \
--convert_to=nemo \
--model_type=llama \
--num_gpus=1 \
--hf_model_name=meta-llama/Llama-3.1-8B
```
## Run training
Run the training (assuming slurm configuration here with the same folder structure). If your cluster has a strict
timeout policy, you can run multiple dependent jobs with `--num_training_jobs=N`.
```bash
ns train \
--cluster=slurm \
--expname=openmathinstruct2-repro-8b \
--output_dir=/workspace/openmathinstruct2-repro/checkpoints \
--nemo_model=/workspace/llama3.1-8b-nemo \
--num_nodes=8 \
--num_gpus=8 \
--average_steps=10000,20000,30000,40000,50000,60000 \
--training_data=/workspace/openmathinstruct2-sft.jsonl \
++model.data.train_ds.micro_batch_size=8 \
++model.tensor_model_parallel_size=4 \
++model.pipeline_model_parallel_size=1 \
++model.optim.lr=2e-5 \
++trainer.sft.save_interval=10000 \
++trainer.sft.max_steps=60000 \
++trainer.sft.max_epochs=100
```
For 70B model, we used 5M data subset and the following parameters, but training
it longer is likely going to improve results.
```bash
ns train \
--cluster=slurm \
--expname=openmathinstruct2-repro-70b \
--output_dir=/workspace/openmathinstruct2-repro-70b/checkpoints \
--nemo_model=/workspace/llama3.1-70b-nemo \
--num_nodes=32 \
--num_gpus=8 \
--average_steps=3330,6660,9990,13320,16650,20000 \
--training_data=/workspace/openmathinstruct2-sft-5M.jsonl \
++model.data.train_ds.micro_batch_size=1 \
++model.tensor_model_parallel_size=8 \
++model.pipeline_model_parallel_size=2 \
++model.optim.lr=1e-5 \
++trainer.sft.save_interval=3330 \
++trainer.sft.max_steps=20000 \
++trainer.sft.max_epochs=100
```
If you have a job timeout, it's necessary to set the maximum time per run to 40 minutes
before the timeout to allow for the final checkpoint to be saved. E.g. if your timeout is 4 hours,
add `++exp_manager.max_time_per_run=00:03:20:00`.
If you want to follow up with checkpoint conversion and evaluation, see
[training docs](../../pipelines/training.md#chaining-pipelines-with-python) for an example of how to do it
through a convenient Python API.
# Model evaluation
Here are the commands you can run to reproduce our evaluation numbers.
The commands below are for [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B) model as an example.
We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
executing all commands from that folder locally. Change all commands accordingly
if running on slurm or using different paths.
!!! tip "Interactive Chat Interface"
Besides the benchmark numbers shown below, you can also interactively chat with OpenMath models using our
[chat interface](../../basics/chat_interface.md). This allows you to easily test both Chain-of-Thought (CoT) and
Tool-Integrated Reasoning (TIR) modes with code execution in a user-friendly web UI.
!!! note
For small benchmarks such as AIME24 and AIME25 (30 problems each) it is expected to see significant variation
across different evaluation reruns. We've seen the difference as large as 6% even for results that are averaged
across 64 generations. So please don't expect to see exactly the same numbers as presented in our paper, but
they should be within 3-6% of reported results.
## Download models
Get the model from HF. E.g.
```bash
pip install -U "huggingface_hub[cli]"
huggingface-cli download nvidia/OpenMath-Nemotron-1.5B --local-dir OpenMath-Nemotron-1.5B
```
## Prepare evaluation data
```bash
ns prepare_data comp-math-24-25 hle
```
## Run CoT evaluations
```bash
ns eval \
--cluster=local \
--model=/workspace/OpenMath-Nemotron-1.5B \
--server_type=trtllm \
--output_dir=/workspace/openmath-nemotron-1.5b-eval-cot \
--benchmarks=comp-math-24-25:64 \
--server_gpus=1 \
--num_jobs=1 \
++prompt_template=qwen-instruct \
++prompt_config=generic/math \
++inference.tokens_to_generate=32768 \
++inference.temperature=0.6
ns eval \
--cluster=local \
--model=/workspace/OpenMath-Nemotron-1.5B \
--server_type=trtllm \
--output_dir=/workspace/openmath-nemotron-1.5b-eval-cot \
--benchmarks=hle:64 \
--server_gpus=1 \
--num_jobs=1 \
--split=math \
++prompt_template=qwen-instruct \
++prompt_config=generic/math \
++inference.tokens_to_generate=32768 \
++inference.temperature=0.6
```
This will take a very long time unless you run on slurm cluster.
If running on slurm, you can set `--num_jobs` to a bigger number or -1 to run
each benchmark on a separate node. The number of GPUs needs to match what you used
in the conversion command.
For comp-math-24-25 our symbolic checker is good enough, so we can see the results right away by running
```bash
ns summarize_results /workspace/openmath-nemotron-1.5b-eval-cot/eval-results/comp-math-24-25 --metric_type math --cluster local
```
For hle-math it's necessary to run LLM-as-a-judge step to get accurate evaluation results.
We used [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) which you
can run with the following command, assuming you have the model downloaded and converted locally
or on cluster.
```bash
ns generate \
--generation_type=math_judge \
--cluster=local \
--model=/hf_models/Qwen2.5-32B-Instruct \
--server_type=trtllm \
--server_gpus=4 \
--output_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle \
--input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results/hle
```
Alternatively, you can use an API model like gpt-4o, but the results might be different.
You need to define `OPENAI_API_KEY` for the command below to work.
```bash
ns generate \
--generation_type=math_judge \
--cluster=local \
--model=gpt-4o \
--server_type=openai \
--server_address=https://api.openai.com/v1 \
--output_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle \
--input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results/hle
```
To print the metrics run
```bash
ns summarize_results /workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle --metric_type math --cluster local
```
This should print the metrics including both symbolic and judge evaluation.
## Run TIR evaluations
To get TIR evaluation numbers, replace the generation commands like this
```bash
ns eval \
--cluster=local \
--model=/workspace/OpenMath-Nemotron-1.5B \
--server_type=trtllm \
--output_dir=/workspace/openmath-nemotron-1.5b-eval-tir \
--benchmarks=comp-math-24-25:64 \
--server_gpus=1 \
--num_jobs=1 \
--with_sandbox \
++code_tags=openmath \
++prompt_template=qwen-instruct \
++prompt_config=openmath/tir \
++inference.tokens_to_generate=32768 \
++inference.temperature=0.6 \
++code_execution=true \
++server.code_execution.add_remaining_code_executions=true \
++total_code_executions_in_prompt=8
```
The only exception is [OpenMath-Nemotron-14B-Kaggle](https://huggingface.co/nvidia/OpenMath-Nemotron-14B-Kaggle),
for which you should use the following options instead:
```bash
ns eval \
--cluster=local \
--model=/workspace/openmath-nemotron-14b-kaggle-trtllm \
--server_type=trtllm \
--output_dir=/workspace/openmath-nemotron-14b-kaggle-eval-tir \
--benchmarks=comp-math-24-25:64 \
--server_gpus=1 \
--num_jobs=1 \
--with_sandbox \
++code_tags=openmath \
++prompt_template=qwen-instruct \
++prompt_config=generic/math \
++inference.tokens_to_generate=32768 \
++inference.temperature=0.6 \
++code_execution=true
```
All other commands are the same as in the [CoT part](#run-cot-evaluations).
## Run GenSelect evaluations
Here is a sample command to run GenSelect evaluation:
```bash
ns genselect \
--preprocess_args="++input_dir=/workspace/openmath-nemotron-1.5b-eval-cot/eval-results-judged/hle" \
--model=/trt_models/openmath-nemotron-1.5b \
++prompt_template=qwen-instruct \
--output_dir=/workspace/openmath-nemotron-1.5b-eval-cot/self_genselect_hle \
--cluster=local \
--server_type=trtllm \
--server_gpus=1 \
--num_random_seeds=64
```
The output folder will have three folders (apart from log folders):
1. `comparison_instances`: This is the folder where input instances for genselect are kept.
2. `comparison_judgment`: Output of GenSelect judgments.
3. `hle` / `math`: Folder with outputs based on GenSelect's judgments. If `dataset` is not specified in the command, we create a folder with the name `math`.
To print the metrics run:
```bash
ns summarize_results \
/workspace/openmath-nemotron-1.5b-eval-cot/self_genselect_hle/hle \
--metric_type math \
--cluster local
```
---
date: 2025-04-23
---
# OpenMathReasoning
## OpenMathReasoning Dataset
Using our pipelines we created [OpenMathReasoning dataset](https://huggingface.co/datasets/nvidia/OpenMathReasoning).
This dataset contains
* 306K unique mathematical problems sourced from [AoPS forums](https://artofproblemsolving.com/community) with:
* 3.2M long chain-of-thought (CoT) solutions
* 1.7M long tool-integrated reasoning (TIR) solutions
* 566K samples that select the most promising solution out of many candidates (GenSelect)
* Additional 193K problems sourced from AoPS forums (problems only, no solutions)
We used [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) to preprocess problems, and
[DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) and [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) to generate solutions.
This dataset was a foundation of our winning submission to the
[AIMO-2 Kaggle competition](https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-2/leaderboard).
See our [paper](https://arxiv.org/abs/2504.16891) to learn more details!
## OpenMath-Nemotron Models
To demonstrate the quality of this dataset, we release a series of OpenMath-Nemotron models trained on this data.
* [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B)
* [OpenMath-Nemotron-7B](https://huggingface.co/nvidia/OpenMath-Nemotron-7B)
* [OpenMath-Nemotron-14B](https://huggingface.co/nvidia/OpenMath-Nemotron-14B)
* [OpenMath-Nemotron-14B-Kaggle](https://huggingface.co/nvidia/OpenMath-Nemotron-14B-Kaggle) (this is the model used in [AIMO-2 Kaggle competition](https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-2/leaderboard))
* [OpenMath-Nemotron-32B](https://huggingface.co/nvidia/OpenMath-Nemotron-32B)
![Evaluation Results](./openmath-results.png)
The models achieve strong results on popular mathematical benchmarks. We present metrics as pass@1 (maj@64) where pass@1
is an average accuracy across 64 generations and maj@64 is the result of majority voting.
Please see our [paper](https://arxiv.org/abs/2504.16891) for more details on the evaluation setup.
| Model | AIME24 | AIME25 | HMMT-24-25 | HLE-Math |
| ------------------------------------------------------------------------------------------------ | ----------- | ----------- | ----------- | ----------- |
| DeepSeek-R1-Distill-Qwen-1.5B | 26.8 (60.0) | 21.4 (36.7) | 14.2 (26.5) | 2.9 (5.0) |
| [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B) CoT | 61.6 (80.0) | 49.5 (66.7) | 39.9 (53.6) | 5.4 (5.4) |
| [OpenMath-Nemotron-1.5B](https://huggingface.co/nvidia/OpenMath-Nemotron-1.5B) TIR | 52.0 (83.3) | 39.7 (70.0) | 37.2 (60.7) | 2.5 (6.2) |
| + Self GenSelect | 83.3 | 70.0 | 62.2 | 7.9 |
| + 32B GenSelect | 83.3 | 70.0 | 62.8 | 8.3 |
| DeepSeek-R1-Distill-Qwen-7B | 54.4 (80.0) | 38.6 (53.3) | 30.6 (42.9) | 3.3 (5.2) |
| [OpenMath-Nemotron-7B](https://huggingface.co/nvidia/OpenMath-Nemotron-7B) CoT | 74.8 (80.0) | 61.2 (76.7) | 49.7 (57.7) | 6.6 (6.6) |
| [OpenMath-Nemotron-7B](https://huggingface.co/nvidia/OpenMath-Nemotron-7B) TIR | 72.9 (83.3) | 57.5 (76.7) | 54.6 (66.3) | 7.8 (10.8) |
| + Self GenSelect | 86.7 | 76.7 | 68.4 | 11.5 |
| + 32B GenSelect | 86.7 | 76.7 | 69.9 | 11.9 |
| DeepSeek-R1-Distill-Qwen-14B | 65.8 (80.0) | 48.4 (60.0) | 40.1 (52.0) | 4.2 (4.8) |
| [OpenMath-Nemotron-14B-MIX (kaggle)](https://huggingface.co/nvidia/OpenMath-Nemotron-14B-Kaggle) | 73.7 (86.7) | 57.9 (73.3) | 50.5 (64.8) | 5.7 (6.5) |
| [OpenMath-Nemotron-14B](https://huggingface.co/nvidia/OpenMath-Nemotron-14B) CoT | 76.3 (83.3) | 63.0 (76.7) | 52.1 (60.7) | 7.5 (7.6) |
| [OpenMath-Nemotron-14B](https://huggingface.co/nvidia/OpenMath-Nemotron-14B) TIR | 76.3 (86.7) | 61.3 (76.7) | 58.6 (70.9) | 9.5 (11.5) |
| + Self GenSelect | 86.7 | 76.7 | 72.4 | 14.1 |
| + 32B GenSelect | 90.0 | 76.7 | 71.9 | 13.7 |
| QwQ-32B | 78.1 (86.7) | 66.5 (76.7) | 55.9 (63.3) | 9.0 (9.5) |
| DeepSeek-R1-Distill-Qwen-32B | 66.9 (83.3) | 51.8 (73.3) | 39.9 (51.0) | 4.8 (6.0) |
| [OpenMath-Nemotron-32B](https://huggingface.co/nvidia/OpenMath-Nemotron-32B) CoT | 76.5 (86.7) | 62.5 (73.3) | 53.0 (59.2) | 8.3 (8.3) |
| [OpenMath-Nemotron-32B](https://huggingface.co/nvidia/OpenMath-Nemotron-32B) TIR | 78.4 (93.3) | 64.2 (76.7) | 59.7 (70.9) | 9.2 (12.5) |
| + Self GenSelect | 93.3 | 80.0 | 73.5 | 15.7 |
| DeepSeek-R1 | 79.1 (86.7) | 64.3 (73.3) | 53.0 (59.2) | 10.5 (11.4) |
## Paper
[AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning Models with OpenMathReasoning dataset](https://arxiv.org/abs/2504.16891)
If you find our work useful, please consider citing us!
```bibtex
@article{moshkov2025aimo2,
title = {{AIMO-2 Winning Solution: Building State-of-the-Art Mathematical Reasoning Models with OpenMathReasoning dataset}},
author = {Ivan Moshkov and Darragh Hanley and Ivan Sorokin and Shubham Toshniwal and Christof Henkel and Benedikt Schifferer and Wei Du and Igor Gitman},
year = {2025},
journal = {arXiv preprint arXiv:2504.16891}
}
```
## How to reproduce our results
Browse the sections below to see all commands needed to fully reproduce our results.
Please note that unless you have access to a large GPU cluster, it might take a very long time
for some of the commands to complete!
- [Model evaluation](evaluation.md)
- [Dataset construction](dataset.md)
- [Model training](training.md)
# Dataset construction
!!! note
This page has instructions for how to re-generate datasets from scratch. If you just want to download existing
data that we released, you can use the scripts in the [training documentation](./training.md#download-data-and-convert-to-sft-format).
Here are the commands you can run to re-create our synthetic dataset.
We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
running commands with a Slurm config. Change all commands accordingly if running locally or using different paths.
## Math data
### Solution generation
We use problems from [OpenMathReasoning](https://huggingface.co/datasets/nvidia/OpenMathReasoning) dataset. So first,
download them using this Python snippet and put inside `/workspace/open-reasoning/sdg` on your Slurm cluster.
We found that the quality of converted proof problems is not high, so we are excluding them here.
```python
from datasets import concatenate_datasets, load_dataset
def remove_proofs(example):
    """Filter predicate that keeps every problem except converted proofs.

    Args:
        example: a dataset row with a ``problem_type`` field.

    Returns:
        True if the row should be kept, i.e. its ``problem_type`` is not
        ``'converted_proof'``; False otherwise.
    """
    return example['problem_type'] != 'converted_proof'
# These columns are removed from both splits before they are merged.
solution_columns = ['generation_model', 'generated_solution', 'inference_mode', 'used_in_kaggle']

splits = load_dataset("nvidia/OpenMathReasoning")
problem_splits = [
    splits[split_name].remove_columns(solution_columns)
    for split_name in ('cot', 'additional_problems')
]

# Merge the two splits, drop converted proofs and save the result as jsonl.
problems = concatenate_datasets(problem_splits).filter(remove_proofs, num_proc=20)
problems.to_json("math-problems.jsonl")
```
Next, prepare the [DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528) to run on Slurm.
Here we assume that model is hosted on 16 H100 GPUs, but other GPU configurations are possible with corresponding
modifications to commands.
To download the model you can run the following from `/workspace` folder on Slurm.
We will also need [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) to use as the judge
for answer correctness.
```bash
huggingface-cli download deepseek-ai/DeepSeek-R1-0528 --local-dir DeepSeek-R1-0528
huggingface-cli download Qwen/Qwen2.5-32B-Instruct --local-dir Qwen2.5-32B-Instruct
```
The next step is optional, but we recommend sharding the checkpoint to avoid very long loading time.
```python
from nemo_skills.pipeline.cli import run_cmd, wrap_arguments
# Command line for the sharding script. Adjacent string literals are
# concatenated, so every piece carries its own separating spaces.
shard_cmd = (
    "python3 nemo_skills/conversion/save_sharded_state.py "
    " --model-path=/workspace/DeepSeek-R1-0528 "
    " --output=/workspace/DeepSeek-R1-0528-tp16 "
    " --tensor-parallel-size=16 "
    " --context-len=8192 "
    " --trust-remote-code "
    " --nnodes 2 "
    " --dist-init-addr $SLURM_MASTER_NODE:20000 "
    " --node-rank $SLURM_PROCID "
)

# 2 nodes x 8 GPUs matches the tensor-parallel size of 16 requested above.
run_cmd(
    ctx=wrap_arguments(shard_cmd),
    cluster="slurm",
    container="sglang",
    num_nodes=2,
    num_gpus=8,
    log_dir="/workspace/DeepSeek-R1-0528-tp16",
)
```
Finally, launch the data generation command. You can adjust `num_chunks` (how many jobs to launch in parallel) and
`dependent_jobs` (how many jobs to launch sequentially in case there is a fixed timeout on cluster) to fit your setup.
```python
from nemo_skills.pipeline.cli import generate, run_cmd, wrap_arguments
cluster = 'slurm'
tokens_to_generate = 32768
num_solutions = 16

# Main generation - this will take a lot of time and GPUs!
# You can select a subset of data to run on if you want to test things
generate(
    ctx=wrap_arguments(
        # only the last piece interpolates a value, so only it is an f-string
        "++prompt_config=generic/math "
        "++inference.temperature=0.6 "
        f"++inference.tokens_to_generate={tokens_to_generate} "
    ),
    cluster=cluster,
    input_file="/workspace/open-reasoning/sdg/math-problems.jsonl",
    output_dir="/workspace/open-reasoning/sdg/solutions",
    expname="r1-0528-math-solutions",
    model="/workspace/DeepSeek-R1-0528-tp16",
    server_type="sglang",
    server_gpus=8,
    server_nodes=2,
    # context length = generation budget + 2000 (presumably headroom for the prompt)
    server_args=f"--load-format sharded_state --context-length {tokens_to_generate + 2000}",
    num_random_seeds=num_solutions,
    # set these according to your cluster configuration
    # num_chunks=N,
    # dependent_jobs=M,
)
# Judge step, this one is very fast as it just compares the predicted
# and expected answers for each solution, doesn't check reasoning
generate(
    ctx=wrap_arguments(""),
    cluster=cluster,
    generation_type="math_judge",
    # plain strings: nothing is interpolated into these paths
    input_dir="/workspace/open-reasoning/sdg/solutions",
    output_dir="/workspace/open-reasoning/sdg/solutions-judged",
    expname="r1-0528-math-solutions-judge",
    # start only after the main generation experiment has finished
    run_after="r1-0528-math-solutions",
    model="/workspace/Qwen2.5-32B-Instruct",
    server_type="sglang",
    server_gpus=8,
    num_random_seeds=num_solutions,
)
# If none of the R1 solutions matches the ground-truth answer for a problem,
# "expected_answer" is replaced with the R1 majority answer. This will be
# wrong for some really hard problems, but we found that in most cases where
# R1 cannot match the GT answer even once, the GT answer itself is incorrect.
majority_cmd = (
    "python /nemo_run/code/recipes/openreasoning/scripts/use_majority_if_no_answer.py "
    " /workspace/open-reasoning/sdg/solutions-judged "
    " /workspace/open-reasoning/sdg/maj-if-no-correct "
)
run_cmd(
    ctx=wrap_arguments(majority_cmd),
    cluster=cluster,
    expname="change-to-majority-if-no-correct",
    run_after="r1-0528-math-solutions-judge",
    log_dir="/workspace/open-reasoning/sdg/maj-if-no-correct",
)
# Next we re-judge the data to keep matches with the new majority answer
# (should cover non-string match cases like 0.5 vs 1/2)
generate(
    ctx=wrap_arguments(""),
    cluster=cluster,
    generation_type="math_judge",
    # plain strings: nothing is interpolated into these paths
    input_dir="/workspace/open-reasoning/sdg/maj-if-no-correct",
    output_dir="/workspace/open-reasoning/sdg/maj-if-no-correct-judged",
    expname="r1-0528-math-solutions-judge-after-majority",
    # start only after the majority-replacement step has finished
    run_after="change-to-majority-if-no-correct",
    model="/workspace/Qwen2.5-32B-Instruct",
    server_type="sglang",
    server_gpus=8,
    num_random_seeds=num_solutions,
)
# Final step: convert the judged data into the SFT format.
# The same script also drops everything that was not judged as correct.
prepare_sft_cmd = (
    "python -m nemo_skills.training.prepare_data "
    " ++prompt_template=qwen-instruct "
    " ++prompt_config=generic/math "
    " ++input_files='/workspace/open-reasoning/sdg/maj-if-no-correct-judged/output-rs*.jsonl' "
    " ++output_path=/workspace/open-reasoning/sft-data-math.jsonl "
    " ++filters.drop_multi_boxed=false "
    " ++filters.trim_prefix=false "
    " ++filters.remove_no_think_tags=true "
    # OpenMathReasoning is already decontaminated
    " ++filters.remove_contaminated=false "
    " ++filters.remove_len_outlier_solutions=false "
    " ++filters.remove_len_outlier_problems=false "
    " ++use_judgement=true "
)

run_cmd(
    ctx=wrap_arguments(prepare_sft_cmd),
    cluster=cluster,
    expname="prepare-for-sft-math",
    run_after="r1-0528-math-solutions-judge-after-majority",
    log_dir="/workspace/open-reasoning/sft-data-math-logs",
)
```
The final data that's ready for training will then be available in `/workspace/open-reasoning/sft-data-math.jsonl`.
### GenSelect data
Coming soon!
## Code data
The code data was created with exactly the same pipeline as used for [OpenCodeReasoning dataset](../opencodereasoning/dataset.md),
except the solutions are generated with [DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528).
## Science data
We generate science problems using [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) and [Qwen3-235B-A22B](https://huggingface.co/Qwen/Qwen3-235B-A22B) LLMs with the [prompt for science question generation](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/openreasoning/prompts/science_question_generation_prompt.yaml), using few-shot examples to demonstrate the format.
Questions are generated based on difficulty level, topic, and subtopic.
Full dataset used for this effort is available at [HuggingFace](https://huggingface.co/datasets/nvidia/OpenScience).
Note: HuggingFace version includes questions generated with [Qwen2.5-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-72B-Instruct), which are not used for OpenReasoning.
The next step is to augment these problems using the [prompt for science question augmentation](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/openreasoning/prompts/science_question_augmentation_prompt.yaml), with few-shot examples to demonstrate the format of the output.
Next, we generate solutions for these problems.
We use [DeepSeek-R1-0528](https://huggingface.co/deepseek-ai/DeepSeek-R1-0528) to generate solutions with parameters as described in the math section above.
The final step is to apply majority voting over the solutions generated in the previous step to obtain the final dataset.
The resulting dataset, OpenScienceReasoning-2, is available for download on Hugging Face [here](https://huggingface.co/datasets/nvidia/OpenScienceReasoning-2).
\ No newline at end of file
# Model evaluation
Here are the commands you can run to reproduce our evaluation numbers.
We assume you have `/workspace` defined in your [cluster config](../../basics/cluster-configs.md) and are
executing all commands from that folder locally. Change all commands accordingly
if running on slurm or using different paths.
## Download models
Get the models from HF. E.g.
```bash
huggingface-cli download nvidia/OpenReasoning-Nemotron-1.5B --local-dir OpenReasoning-Nemotron-1.5B
```
To evaluate HLE we used [Qwen2.5-32B-Instruct](https://huggingface.co/Qwen/Qwen2.5-32B-Instruct) model as a judge.
You will need to download it as well if you want to reproduce HLE numbers:
```bash
huggingface-cli download Qwen/Qwen2.5-32B-Instruct --local-dir Qwen2.5-32B-Instruct
```
## Prepare evaluation data
```bash
ns prepare_data aai aime24 aime25 hmmt_feb25 brumo25 livecodebench gpqa mmlu-pro hle
```
## Run evaluation
!!! note
The current script only supports GenSelect evaluation for math benchmarks.
We will add instructions and commands for GenSelect for code and science in the next few days.
We provide an evaluation script in [recipes/openreasoning/eval.py](https://github.com/NVIDIA/NeMo-Skills/tree/main/recipes/openreasoning/eval.py).
It will run evaluation on all benchmarks and for all 4 model sizes. You can modify it directly to change evaluation settings
or to only evaluate a subset of models / benchmarks.
After the evaluation is finished, you can find `metrics.json` files in each benchmark folder with full scores.
To view GenSelect scores additionally run the following commands for each benchmark and model size. E.g. for 14B and `hmmt_feb25` benchmark, run
```bash
ns summarize_results /workspace/open-reasoning-evals/14B-genselect/hmmt_feb25/math/ --metric_type math
```
which should print the following scores. Here `majority@64` is the number we are looking for.
Note that this is majority across GenSelect runs, not original generations.
```bash
----------------------------------- math ----------------------------------
evaluation_mode | num_entries | avg_tokens | symbolic_correct | no_answer
pass@1[avg-of-64] | 30 | 16066 | 85.78% | 0.21%
majority@64 | 30 | 16066 | 93.33% | 0.00%
pass@64 | 30 | 16066 | 96.67% | 0.00%
```
\ No newline at end of file
---
date: 2025-07-18
---
# OpenReasoning
We released OpenReasoning-Nemotrons: a suite of reasoning-capable large language models (LLMs) which have been distilled from the DeepSeek R1 0528 671B model. Trained on a massive, high-quality dataset distilled from the new DeepSeek R1 0528, our new 7B, 14B, and 32B models achieve state-of-the-art performance on a wide range of reasoning benchmarks for their respective sizes in the domain of mathematics, science and code.
The models are available to download from **Hugging Face** ([1.5B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-1.5B), [7B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-7B), [14B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-14B), [32B](https://huggingface.co/nvidia/OpenReasoning-Nemotron-32B)).
The foundation of these models is their dataset. We generated **5 million high-quality reasoning-based solutions** by leveraging the powerful DeepSeek R1 0528 model across the domains of mathematics, coding, and science. This dataset will be released in the coming months, enabling all models to improve their reasoning capabilities on these domains.
## Evaluation results
![Evaluation Results with pass@1](./pass-1.png)
Our models demonstrate exceptional performance across a suite of challenging reasoning benchmarks. The 7B, 14B, and 32B models consistently set new state-of-the-art records for their size classes.
| **Model** | **ArtificialAnalysisIndex*** | **GPQA** | **MMLU-PRO** | **HLE** | **LiveCodeBench*** | **SciCode** | **AIME24** | **AIME25** | **HMMT FEB 25** |
| :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- | :--- |
| **1.5B**| 31.0 | 31.6 | 47.5 | 5.5 | 28.6 | 2.2 | 55.5 | 45.6 | 31.5 |
| **7B** | 54.7 | 61.1 | 71.9 | 8.3 | 63.3 | 16.2 | 84.7 | 78.2 | 63.5 |
| **14B** | 60.9 | 71.6 | 77.5 | 10.1 | 67.8 | 23.5 | 87.8 | 82.0 | 71.2 |
| **32B** | 64.3 | 73.1 | 80.0 | 11.9 | 70.2 | 28.5 | 89.2 | 84.0 | 73.8 |
\* This is our estimation of the Artificial Analysis Intelligence Index, not an official score.
\* LiveCodeBench version 6, date range 2408-2505.
## Combining the work of multiple agents
OpenReasoning-Nemotron models can be used in a "heavy" mode by starting multiple parallel generations and combining them together via [generative solution selection (GenSelect)](https://arxiv.org/abs/2504.16891). To add this "skill" we follow the original GenSelect training pipeline except we do not train on the selection summary but use the full reasoning trace of DeepSeek R1 0528 671B instead. We only train models to select the best solution for math problems but surprisingly find that this capability directly generalizes to code and science questions! With this "heavy" GenSelect inference mode, OpenReasoning-Nemotron-32B model surpasses O3 (High) on math and coding benchmarks.
![Evaluation Results with GenSelect](./genselect.png)
| **Model** | **Pass@1 (Avg@64)** | **Majority@64** | **GenSelect** |
| :--- | :--- | :--- | :--- |
| **1.5B** | | | |
| **AIME24** | 55.5 | 76.7 | 76.7 |
| **AIME25** | 45.6 | 70.0 | 70.0 |
| **HMMT Feb 25** | 31.5 | 46.7 | 53.3 |
| **7B** | | | |
| **AIME24** | 84.7 | 93.3 | 93.3 |
| **AIME25** | 78.2 | 86.7 | 93.3 |
| **HMMT Feb 25** | 63.5 | 83.3 | 90.0 |
| **LCB v6 2408-2505** | 63.4 | n/a | 67.7 |
| **14B** | | | |
| **AIME24** | 87.8 | 93.3 | 93.3 |
| **AIME25** | 82.0 | 90.0 | 90.0 |
| **HMMT Feb 25** | 71.2 | 86.7 | 93.3 |
| **LCB v6 2408-2505** | 67.9 | n/a | 69.1 |
| **32B** | | | |
| **AIME24** | 89.2 | 93.3 | 93.3 |
| **AIME25** | 84.0 | 90.0 | 93.3 |
| **HMMT Feb 25** | 73.8 | 86.7 | 96.7 |
| **LCB v6 2408-2505** | 70.2 | n/a | 75.3 |
| **HLE** | 11.8 | 13.4 | 15.5 |
## How to reproduce our results
Browse the sections below to see all commands needed to fully reproduce our results.
Please note that unless you have access to a large GPU cluster, it might take a very long time
for some of the commands to complete!
- [Model evaluation](evaluation.md)
- [Dataset construction](dataset.md)
- [Model training](training.md)
# Model training
## Download data and convert to SFT format
OpenReasoning dataset consists of 5 independent parts:
* Math CoT data
* Math TIR data
* Math GenSelect data
* Code CoT data
* Science CoT data
All datasets except GenSelect are now released. You can use code snippets below to download them and prepare for SFT.
For final training dataset, you should concatenate all of the data together.
### Math CoT data
Math CoT data is released as part of the [nvidia/Nemotron-Post-Training-Dataset-v1](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1) dataset.
```python
from functools import partial
from datasets import load_dataset
from nemo_skills.prompt.utils import get_prompt
def apply_format(elem, prompt):
    """Add SFT ``input`` / ``output`` fields to a two-message sample.

    Args:
        elem: dataset row whose ``messages`` list has exactly two entries;
            the content of the first is used as the problem and the content
            of the second as the solution.
        prompt: prompt object providing ``fill`` and
            ``config.template.assistant_end``.

    Returns:
        The same ``elem`` with ``input`` and ``output`` keys set.

    Raises:
        AssertionError: if ``messages`` does not contain exactly two entries.
    """
    assert len(elem['messages']) == 2
    elem['input'] = prompt.fill({'problem': elem['messages'][0]['content']})
    # The target text is the solution followed by the template's assistant-end string.
    elem['output'] = elem['messages'][1]['content'] + prompt.config.template.assistant_end
    return elem
# Load the math split and the prompt used to render each problem.
math_split = load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="math")
math_prompt = get_prompt('generic/math', 'qwen-instruct')

# apply_format fills in input/output; the messages column is dropped before saving.
formatted = math_split.map(partial(apply_format, prompt=math_prompt), num_proc=20)
formatted.remove_columns(['messages']).to_json("open-reasoning-math-cot.jsonl")
```
### Math TIR data
We re-use math TIR data from [nvidia/OpenMathReasoning](https://huggingface.co/datasets/nvidia/OpenMathReasoning) dataset.
While we included this data in training and our released models are capable of TIR inference, we found that results are
generally worse than using CoT. To fix this, TIR data would need to be re-generated using newer models, but this is not
done in our current release.
To get this data, follow instructions for the **second-round** SFT data in [OpenMathReasoning documentation](../openmathreasoning/training.md#second-round-sft).
### Math GenSelect data
Coming soon!
### Code CoT data
Code CoT data is released as part of the [nvidia/Nemotron-Post-Training-Dataset-v1](https://huggingface.co/datasets/nvidia/Nemotron-Post-Training-Dataset-v1) dataset.
```python
import json
from functools import partial

from datasets import load_dataset

from nemo_skills.prompt.utils import get_prompt

# Source benchmarks that hold the original problem statements; the released
# records only carry a (dataset, split, index) pointer into these.
question_datasets = {
    "taco": load_dataset("BAAI/TACO"),
    "apps": load_dataset("codeparrot/apps"),
    "code_contests": load_dataset("deepmind/code_contests"),
    "open-r1/codeforces": load_dataset("open-r1/codeforces"),
}


def get_question(ds_name, split, index):
    """Look up the problem statement for one record of a source benchmark.

    Raises RuntimeError when `ds_name` is not one of the known benchmarks.
    """
    benchmark = question_datasets[ds_name][split][int(index)]

    if ds_name == "code_contests":
        return benchmark["description"]

    if ds_name in ["taco", "apps"]:
        return benchmark["question"]

    if ds_name != "open-r1/codeforces":
        raise RuntimeError("Something wrong with the data!")

    # codeforces stores the statement in separate fields, so stitch them back together
    question = benchmark["description"]
    if benchmark["input_format"]:
        question += "\n\nInput\n\n" + benchmark["input_format"]
    if benchmark["output_format"]:
        question += "\n\nOutput\n\n" + benchmark["output_format"]
    if benchmark["examples"]:
        question += "\n\nExamples"
        for example in benchmark["examples"]:
            if "input" in example:
                question += "\n\nInput\n\n" + example["input"]
            if "output" in example:
                question += "\n\nOutput\n\n" + example["output"]
    if benchmark["note"]:
        question += "\n\nNote\n\n" + benchmark["note"]
    return question


def apply_format(elem, prompt):
    """Resolve the record's question from its metadata and fill SFT `input`/`output` fields."""
    metadata = json.loads(elem['metadata'])
    question = get_question(metadata['dataset'], metadata['split'], int(metadata['index']))
    elem['input'] = prompt.fill({'question': question})
    elem['output'] = elem['messages'][1]['content'] + prompt.config.template.assistant_end
    return elem


prompt = get_prompt('eval/livecodebench/python_codegen_reasoning', 'qwen-instruct')
format_fn = partial(apply_format, prompt=prompt)

dataset = load_dataset("nvidia/Nemotron-Post-Training-Dataset-v1", split="code")
dataset = dataset.map(format_fn, num_proc=20)
dataset = dataset.remove_columns(['messages'])
dataset.to_json("open-reasoning-code-cot.jsonl")
```
### Science CoT data
Science CoT data is released as [nvidia/OpenScienceReasoning-2](https://huggingface.co/datasets/nvidia/OpenScienceReasoning-2) dataset.
```python
from functools import partial

from datasets import load_dataset

from nemo_skills.prompt.utils import get_prompt


def apply_format(elem, prompt):
    """Wrap the record's `input` in the prompt template and terminate its `output`."""
    raw_question = elem['input']
    elem['input'] = prompt.fill({'question': raw_question})
    elem['output'] += prompt.config.template.assistant_end
    return elem


prompt = get_prompt('generic/default', 'qwen-instruct')  # data already includes instruction
format_fn = partial(apply_format, prompt=prompt)

dataset = load_dataset("nvidia/OpenScienceReasoning-2", split="train")
dataset = dataset.map(format_fn, num_proc=20)
dataset.to_json("open-reasoning-science-cot.jsonl")
```
## Train the models
We mostly use the same training commands as for [OpenMathReasoning models](../openmathreasoning/training.md#run-training). The only differences
are that we pack sequences to a length of 49152 and use slightly different hyperparameters, detailed in the following table.
| | **lr** | **min_lr** | **TP** | **PP** | **CP** |
| --------------------- | ------ | ---------- | ------ | ------ | ------ |
| **Qwen2.5-Math-1.5B** | 1e-4 | 1e-7 | 1 | 1 | 4 |
| **Qwen2.5-Math-7B** | 1e-4 | 1e-7 | 4 | 1 | 4 |
| **Qwen2.5-14B** | 1e-4 | 1e-7 | 8 | 1 | 4 |
| **Qwen2.5-32B** | 1e-4 | 1e-7 | 8 | 2 | 4 |
All models are trained for 30000 steps with a single round of SFT, and we take the last checkpoint as the final model.
\ No newline at end of file
---
title: Tutorials
hide:
- toc
---
# MkDocs configuration for the NeMo-Skills documentation site.
site_name: NeMo-Skills
site_url: https://nvidia.github.io/NeMo-Skills

extra_css:
  - css/extra.css

plugins:
  - blog:
      blog_dir: tutorials
      post_dir: tutorials/posts
  - redirects:
      redirect_maps:
        'openmathinstruct2/index.md': 'releases/openmathinstruct2/index.md'
        'openmathreasoning1/index.md': 'releases/openmathreasoning/index.md'

theme:
  name: material
  logo: favicon.ico
  favicon: favicon.ico
  palette:
    primary: blue grey
  features:
    - content.code.copy
    - content.code.annotate
    - navigation.instant
    - navigation.instant.progress
    - navigation.tabs
    - navigation.tabs.sticky
    - navigation.indexes
    - toc.follow

markdown_extensions:
  - meta
  - pymdownx.highlight:
      anchor_linenums: true
      line_spans: __span
      pygments_lang_class: true
  - pymdownx.inlinehilite
  - pymdownx.snippets
  - pymdownx.superfences
  - pymdownx.tabbed:
      alternate_style: true
      slugify: !!python/object/apply:pymdownx.slugs.slugify
        kwds:
          case: lower
  - admonition
  - pymdownx.details
  # NOTE(review): pymdownx.superfences is already listed above; confirm the
  # second entry is intentional.
  - pymdownx.superfences
  - toc:
      permalink: true
  - attr_list
  - pymdownx.emoji:
      emoji_index: !!python/name:material.extensions.emoji.twemoji
      emoji_generator: !!python/name:material.extensions.emoji.to_svg
  - footnotes

nav:
  - NeMo-Skills: index.md
  - Getting started:
      - basics/index.md
      - Cluster configs: basics/cluster-configs.md
      - Code packaging: basics/code-packaging.md
      - Prompt format: basics/prompt-format.md
      - Inference: basics/inference.md
      - Chat Interface: basics/chat_interface.md
      - Sandbox for code execution: basics/sandbox.md
  - Pipelines:
      - pipelines/index.md
      - Generation: pipelines/generation.md
      - Evaluation: pipelines/evaluation.md
      - Checkpoint conversion: pipelines/checkpoint-conversion.md
      - LLM-as-a-judge: pipelines/llm-as-a-judge.md
      - Decontamination: pipelines/decontamination.md
      - Training (NeMo-Aligner): pipelines/training.md
      - Training (verl, OpenRLHF): pipelines/training-verl-openrlhf.md
      - Arbitrary commands: pipelines/run-cmd.md
  - Tutorials:
      - tutorials/index.md
  - Papers & Releases:
      - releases/index.md
      - OpenReasoning:
          - releases/openreasoning/index.md
          - Model Evaluation: releases/openreasoning/evaluation.md
          - Dataset construction: releases/openreasoning/dataset.md
          - Model training: releases/openreasoning/training.md
      - OpenCodeReasoning:
          - releases/opencodereasoning/index.md
          - Model Evaluation: releases/opencodereasoning/evaluation.md
          - Dataset construction: releases/opencodereasoning/dataset.md
      - OpenMathReasoning:
          - releases/openmathreasoning/index.md
          - Model Evaluation: releases/openmathreasoning/evaluation.md
          - Dataset construction: releases/openmathreasoning/dataset.md
          - Model training: releases/openmathreasoning/training.md
      - OpenMathInstruct-2:
          - releases/openmathinstruct2/index.md
          - Model Evaluation: releases/openmathinstruct2/evaluation.md
          - Dataset construction: releases/openmathinstruct2/dataset.md
          - Model training: releases/openmathinstruct2/training.md
\ No newline at end of file
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from nemo_skills.version import __version__
# only used in ns setup command to initialize with defaults
# Maps a short container key to the Docker image reference (repository:tag) used for it.
# NOTE(review): every tag is pinned to 0.6.1 independently of the imported __version__ -
# confirm whether the two are meant to be bumped together.
_containers = {
    'trtllm': 'igitman/nemo-skills-trtllm:0.6.1',
    'vllm': 'igitman/nemo-skills-vllm:0.6.1',
    'sglang': 'igitman/nemo-skills-sglang:0.6.1',
    'nemo': 'igitman/nemo-skills-nemo:0.6.1',
    'megatron': 'igitman/nemo-skills-megatron:0.6.1',
    'sandbox': 'igitman/nemo-skills-sandbox:0.6.1',
    'nemo-skills': 'igitman/nemo-skills:0.6.1',
    'verl': 'igitman/nemo-skills-verl:0.6.1',
    'nemo-rl': 'igitman/nemo-skills-nemo-rl:0.6.1',
}
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from nemo_skills.code_execution.utils import extract_code_output, extract_code_to_execute, format_code_output
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import multiprocessing
import os
import resource
import subprocess
import sys
import tempfile
import signal
from io import StringIO
from flask import Flask, request
# Flask application object; the /execute route below is registered on it.
app = Flask(__name__)
# Memory cap in bytes used as the default argument of set_limits(). It is read once at import
# time from NEMO_SKILLS_SANDBOX_MEM_LIMIT, whose value must be parseable by int().
MEM_LIMIT_BYTES = int(os.environ.get('NEMO_SKILLS_SANDBOX_MEM_LIMIT', 10 * 1024 ** 3)) # 10 GiB default
def set_limits(mem_bytes: int = MEM_LIMIT_BYTES) -> None:
    """
    Cap the memory of the calling process and detach it into its own session.

    Sets both the soft and the hard limit of RLIMIT_AS and RLIMIT_DATA to
    `mem_bytes`, then calls `os.setsid()`.
    Used as `preexec_fn` for subprocesses and called directly inside forked workers.
    """
    soft_and_hard = (mem_bytes, mem_bytes)
    for limit_kind in (resource.RLIMIT_AS, resource.RLIMIT_DATA):
        resource.setrlimit(limit_kind, soft_and_hard)
    # new session => own process group, so the whole group can be signalled at once
    os.setsid()
def execute_ipython(generated_code, timeout):
    """Run `generated_code` in a separate worker process and return what it reports.

    The worker (`execute_code_subprocess`) puts exactly one item on a queue; that
    item is returned unchanged.

    Args:
        generated_code: source code handed to the worker.
        timeout: maximum number of seconds to wait for the worker's result;
            `None` waits indefinitely.

    Returns:
        The item produced by the worker, or a dict with `process_status` set to
        "timeout" when no result arrived in time, or to "error" when the worker
        exited without producing any result.
    """
    # Local imports: `queue` below is a local variable, so only `Empty` is pulled in.
    import time
    from queue import Empty

    # running in a separate process to ensure any kind of crashes are properly handled
    queue = multiprocessing.Queue()
    process = multiprocessing.Process(target=execute_code_subprocess, args=(generated_code, queue))
    process.start()

    poll_interval = 0.05
    deadline = None if timeout is None else time.monotonic() + timeout
    result = None
    received = False

    # The queue is read BEFORE joining: a child that has put a large item cannot exit until
    # the item is consumed, so joining first would turn big outputs into false timeouts.
    while not received:
        remaining = poll_interval if deadline is None else deadline - time.monotonic()
        if remaining <= 0:
            break
        try:
            result = queue.get(timeout=min(poll_interval, remaining))
            received = True
        except Empty:
            if process.is_alive():
                continue
            # The child is gone (e.g. the code called sys.exit() or the process was killed).
            # Allow one last short read for data it flushed right before exiting, then give
            # up instead of blocking forever on an empty queue.
            try:
                result = queue.get(timeout=poll_interval)
                received = True
            except Empty:
                break

    still_running = process.is_alive()
    if still_running:
        process.kill()
    process.join()  # reap the child so it does not linger as a zombie

    if received:
        return result
    if still_running:
        return {"process_status": "timeout", "stdout": "", "stderr": "Timed out\n"}
    return {
        "process_status": "error",
        "stdout": "",
        "stderr": f"Process exited with code {process.exitcode} without returning a result\n",
    }
def execute_python(generated_code, std_input, timeout, language):
    """Run `generated_code` with `<language> -c <code>` and capture its output.

    Args:
        generated_code: program text passed to the interpreter's `-c` option.
        std_input: text written to the program's stdin.
        timeout: maximum number of seconds to wait for the program.
        language: executable used to run the code.

    Returns:
        A dict with `process_status` ("completed", "timeout" or "error"),
        `stdout` and `stderr`.
    """
    execution_command = [language, "-c", generated_code]
    try:
        process = subprocess.Popen(
            execution_command,
            stdin=subprocess.PIPE,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            preexec_fn=set_limits,
        )
    except OSError as e:
        # e.g. the requested executable does not exist; report it like other execution errors
        return {"process_status": "error", "stdout": "", "stderr": str(e) + "\n"}

    try:
        stdout, stderr = process.communicate(input=std_input, timeout=timeout)
        return {"process_status": "completed", "stdout": stdout, "stderr": stderr}
    except subprocess.TimeoutExpired:
        try:
            # kill the whole process group (set_limits made the child a session leader)
            os.killpg(process.pid, signal.SIGKILL)
        except ProcessLookupError:
            pass
        try:
            process.wait(timeout=1)  # reap the child
        except subprocess.TimeoutExpired:
            # still not reaped after SIGKILL; do not fail the request because of that
            pass
        # release the pipe file descriptors explicitly
        for pipe in (process.stdin, process.stdout, process.stderr):
            if pipe is not None:
                try:
                    pipe.close()
                except OSError:
                    pass
        return {"process_status": "timeout", "stdout": "", "stderr": "Timed out\n"}
def execute_lean4(generated_code, timeout):
    """Write `generated_code` to a temporary .lean file and check it with `lake env lean`.

    Returns a dict with `process_status` ("completed", "failed", "timeout" or
    "error"), `stdout` and `stderr`. The temporary file is removed afterwards.
    """
    project_path = "/lean4/my_project"
    temp_file_name = None
    try:
        # the file lives inside the project directory and must outlive the `with`
        # block, hence delete=False and the manual cleanup in `finally`
        with tempfile.NamedTemporaryFile(dir=project_path, delete=False, suffix=".lean") as source_file:
            temp_file_name = source_file.name
            source_file.write(generated_code.encode('utf-8'))

        command = ['lake', 'env', '--dir', project_path, 'lean', temp_file_name]
        completed = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            timeout=timeout,
            cwd=project_path,  # Ensure we are in the correct working directory
        )
        return {
            "process_status": "completed" if completed.returncode == 0 else "failed",
            "stdout": completed.stdout.decode('utf-8'),
            "stderr": completed.stderr.decode('utf-8'),
        }
    except subprocess.TimeoutExpired:
        return {"process_status": "timeout", "stdout": "", "stderr": "Timed out\n"}
    except Exception as e:
        print(f"Error: {str(e)}")
        return {"process_status": "error", "stdout": "", "stderr": str(e) + "\n"}
    finally:
        # Safely remove the temporary file if it was created
        if temp_file_name and os.path.exists(temp_file_name):
            os.remove(temp_file_name)
# need to memory-limit to avoid common errors of allocating too much
# but this has to be done in a subprocess to not crash server itself
def execute_code_subprocess(generated_code, queue):
    """Execute `generated_code` with `exec` in the current process and report via `queue`.

    Intended to be the target of a worker process (see `execute_ipython`).

    On success the text captured from `sys.stdout` is put on `queue` as a plain string.
    When the code raises an `Exception`, a dict with `process_status` "error" and the
    exception message in `stderr` is put instead. Exceptions that do not derive from
    `Exception` (e.g. `SystemExit`) are not caught here, so nothing is put in that case.
    """
    # this can be overridden inside generated code, so it's not a guaranteed protection
    set_limits()
    # redirect prints of the executed code into an in-memory buffer
    sys.stdout = StringIO()
    try:
        # fresh globals dict: the executed code does not see this module's names
        exec(generated_code, {})
        # sys.stdout is looked up again here, after the generated code has run
        queue.put(sys.stdout.getvalue())
    except Exception as e:
        # NOTE(review): sys.stdout is presumably still the StringIO at this point, so this
        # message lands in the buffer rather than on the real stdout - confirm intent
        print(f"Error: {str(e)}")
        queue.put({"process_status": "error", "stdout": "", "stderr": str(e) + "\n"})
# Main Flask endpoint to handle execution requests
@app.route("/execute", methods=["POST"])
def execute():
    """Dispatch a POSTed JSON execution request to the matching runner.

    Required JSON fields: `generated_code`, `timeout`.
    Optional JSON fields: `language` (default 'ipython'), `std_input` (default '').
    'ipython' and 'lean4' have dedicated runners; any other `language` value is
    forwarded to `execute_python`, which uses it as the executable to run.
    """
    payload = request.json
    generated_code = payload['generated_code']
    timeout = payload['timeout']
    language = payload.get('language', 'ipython')
    std_input = payload.get('std_input', '')

    if language == 'ipython':
        return execute_ipython(generated_code, timeout)
    if language == 'lean4':
        return execute_lean4(generated_code, timeout)
    return execute_python(generated_code, std_input, timeout, language)
if __name__ == '__main__':
    # only let warnings and above through from the 'werkzeug' logger
    logging.getLogger('werkzeug').setLevel(logging.WARNING)
    app.run(port=6000)
#!/bin/bash
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: needs to run from the root of the repo!

# Image tag to build and run; defaults to 'local-sandbox' when no argument is given.
SANDBOX_NAME=${1:-'local-sandbox'}

# Number of installed processors, used to size the uWSGI build arguments.
NUM_CPUS=$(nproc --all)

docker build --tag="${SANDBOX_NAME}" \
    --build-arg="UWSGI_PROCESSES=$((NUM_CPUS * 10))" \
    --build-arg="UWSGI_CHEAPER=${NUM_CPUS}" \
    -f dockerfiles/Dockerfile.sandbox .

# NOTE(review): docker run is expected to reject --rm combined with --restart
# ("Conflicting options") - confirm which of the two is intended and drop the other.
docker run --network=host --rm --memory="${NEMO_SKILLS_SANDBOX_MEM_LIMIT:-16g}" --restart unless-stopped --name=local-sandbox "${SANDBOX_NAME}"
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import re
from typing import Tuple
from nemo_skills.utils import get_logger_name
# Module-level logger; its name is produced by get_logger_name(__file__).
LOG = logging.getLogger(get_logger_name(__file__))
def format_code_output(
execution_dict,
code_output_begin: str,
code_output_end: str,
code_output_format: str = 'llama',
remaining_code_executions: int | None = None,
):
"""Formatting code output to be displayed as an llm expects it."""
remaining_ce_string = ""
if remaining_code_executions is not None:
if remaining_code_executions > 0:
remaining_ce_string = (
f"```system\n"
f"Remaining code executions: {remaining_code_executions}. "
f"You will not be able to call code when you run out of executions, so use it wisely. "
f"Note that you can still continue solving the problem without code after that.\n"
f"```\n"
)
else:
remaining_ce_string = (
f"```system\n"
f"You have run out of code executions! You can no longer write or execute code. "
f"Now you should continue solving the problem by relying on your mathematical reasoning and analytical skills.\n"
f"```\n"
)
if code_output_format == 'llama':
output = execution_dict["process_status"]
if execution_dict['stdout']:
output += f"\n[stdout]\n{execution_dict['stdout']}[/stdout]"
if execution_dict['stderr']:
output += f"\n[stderr]\n{execution_dict['stderr']}[/stderr]"
output = f"{code_output_begin}\n\n{output}{remaining_ce_string}{code_output_end}\n\n"
elif code_output_format == 'qwen':
output = ""
if execution_dict['stdout']:
output += f"{execution_dict['stdout']}"
if execution_dict['stderr']:
output += f"{execution_dict['stderr']}"
if execution_dict['stderr'] and execution_dict['stdout']:
LOG.warning("Both stdout and stderr are not empty. This shouldn't normally happen! %s", execution_dict)
output = f"{code_output_begin}{output}{code_output_end}{remaining_ce_string}"
else:
raise ValueError(f"Unknown code_output_format: {code_output_format}")
# wrapping with code output separators
return output
def _extract_between_separators(generation: str, separators: Tuple[str, str], extract_all: bool = False):
    """Return the text enclosed by the separators.

    By default only the text after the last `separators[0]` and before the next
    `separators[1]` is returned (a missing separator simply does not cut the text).
    With `extract_all=True`, a list of every enclosed span is returned instead.
    """
    if not extract_all:
        after_last_open = generation.rpartition(separators[0])[2]
        return after_last_open.partition(separators[1])[0]

    # separators are literal text, so they are escaped before being used in a regex
    escaped = [re.escape(sp) for sp in separators]
    matcher = re.compile(f'{escaped[0]}(.*?){escaped[1]}', re.DOTALL)
    return matcher.findall(generation)


def extract_code_to_execute(generation: str, code_begin: str, code_end: str, extract_all: bool = False):
    """Extract the code located between `code_begin` and `code_end`."""
    return _extract_between_separators(generation, [code_begin, code_end], extract_all)


def extract_code_output(generation: str, code_output_begin: str, code_output_end: str, extract_all: bool = False):
    """Extract the code output located between `code_output_begin` and `code_output_end`."""
    return _extract_between_separators(generation, [code_output_begin, code_output_end], extract_all)
def extract_code_block(text: str, languages=None) -> str:
    """Return the stripped body of the first fenced code block found in `text`.

    `languages` lists the fence tags to look for, tried in order; the first tag
    that yields a match wins. When omitted, only the empty tag is tried.
    Returns an empty string when nothing matches.
    """
    tags = [""] if languages is None else languages
    for tag in tags:
        found = re.search(rf"```{tag}\s*\n?(.*?)\n?```", text, flags=re.DOTALL)
        if found is not None:
            return found.group(1).strip()
    return ""
def clean_formal_generation(generation: str, final_answer_key: str = "**FINAL ANSWER**") -> str:
    """Reduce a model generation to the formal code it contains.

    Only the text after the first `final_answer_key` is considered when the key is
    present. A fenced code block (tags tried in order: lean4, lean3, lean, untagged)
    is returned if one is found; otherwise stray fences at the start and end of the
    text are removed and the remainder is returned.
    """
    # Extract part after **FINAL ANSWER** if present
    if final_answer_key in generation:
        _, _, after_key = generation.partition(final_answer_key)
        generation = after_key.strip()

    code = extract_code_block(generation, ["lean4", "lean3", "lean", ""])
    if code:
        return code

    # If no explicit code block, remove any surrounding triple backticks
    stray_fences = r"^\s*```(?:lean4|lean3|lean)?\s*|\s*```[\s]*$"
    return re.sub(stray_fences, "", generation).strip()
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
This diff is collapsed. Click to expand it.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment