Commit 09dfbc06 by nanziyuan

first commit

__pycache__/*
run.py
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J eval-{name} # The job name
#SBATCH -o ret-{name}-%j.out # Write the standard output to 'ret-{name}-<job_id>.out'
#SBATCH -e ret-{name}-%j.err # Write the standard error to 'ret-{name}-<job_id>.err'
#- Needed resources
# (TODO)
# Please modify your requirements
#SBATCH -p cpu # Submit to the 'cpu' partition
#SBATCH -t 1-12:00:00 # Run for a maximum time of 1 day, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --ntasks-per-node=1 # Request N*M tasks
#SBATCH --cpus-per-task=32 # Request N*M*K cores
#SBATCH --qos=cpu-normal # Request QOS Type
#- Operations
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- CPU and Mem
lscpu
free -h -w
#- Load environments
source /tools/module_env.sh
module list # list modules loaded by default
#- tools
module load cluster-tools/v1.0
#- language
##- virtualenv
source /lustre/S/nanziyuan/miniconda3/bin/activate srasm
#- Log information
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
#- Job step
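# The name, nstart and nend placeholders below are presumably filled in via
# str.format by a driver script before this file is submitted.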
python /lustre/S/nanziyuan/projects/codellama_test/src/eval.py -n {name} -s {nstart} -e {nend}
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
#!/bin/bash
#SBATCH -J {name} # The job name
#SBATCH -o ret-{name}-%j.out # Write the standard output to 'ret-{name}-<job_id>.out'
#SBATCH -e ret-{name}-%j.err # Write the standard error to 'ret-{name}-<job_id>.err'
#SBATCH -p r8nv-gpu-hw # Submit to the 'r8nv-gpu-hw' partition
#SBATCH -t 0-12:00:00 # Run for a maximum time of 0 days, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:{gpu_num} # Request M GPU per node
#SBATCH --ntasks-per-node=1 # Request P tasks per node
#SBATCH --cpus-per-task=128 # Request Q cores per task, i.e. P*Q cores per node
#SBATCH --qos=gpu-short # Request QOS Type
#SBATCH --constraint="A100&40G&HGX8A100"
export USER_GPUS_PER_NODE={gpu_num} # Keep this consistent with the --gres request above
export USER_NGPUS=$(($USER_GPUS_PER_NODE*$SLURM_JOB_NUM_NODES))
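# Build a host:gpu list such as "node01:4,node02:4"; the awk braces are doubled
# because this file is a Python str.format template (rendered by mk_slurm).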
nodelist_h_format=$(scontrol show hostnames $SLURM_JOB_NODELIST | \
awk -v gpu=$USER_GPUS_PER_NODE '{{printf ((NR>1?",":"")$0":%s"), gpu}}')
if [[ -z $SLURM_NTASKS ]]; then
echo "SLURM_NTASKS is empty, please check your SBATCH parameter."
exit 1
fi
if [[ -z $SLURM_NTASKS_PER_NODE ]]; then
echo "SLURM_NTASKS_PER_NODE is empty, please check your SBATCH parameter."
exit 1
fi
task_size=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
if [[ $task_size != $SLURM_NTASKS ]]; then
echo "NTASKS_PER_NODE * NNODE != NNTASK, please check your SBATCH parameter."
exit -1
fi
if [[ $task_size != $USER_NGPUS ]]; then
echo "INFO..."
echo "That's a total of $SLURM_NTASKS tasks, requiring a total of $USER_NGPUS GPUs"
echo "Becareful whether your program requires \$SLURM_NTASKS or NGPUS"
fi
#- Global Info
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
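# Derive MASTER_PORT from the last four digits of the job ID (range 50000-59999)
# so that concurrent jobs are unlikely to collide on the same port.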
export MASTER_PORT=$(expr 50000 + $(echo -n $SLURM_JOBID | tail -c 4))
export WORLD_SIZE=${{USER_NGPUS}}
export NCCL_IB_DISABLE=0
export NCCL_P2P_DISABLE=0
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_NET_GDR_LEVEL=2
export NCCL_IB_HCA="mlx5_0,mlx5_1,mlx5_2,mlx5_3"
#- Log information
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Nodelist:= " $SLURM_JOB_NODELIST
echo "Nodelistname:= " $nodelist_h_format
echo "Number of nodes:= " $SLURM_JOB_NUM_NODES
echo "Ntasks per node:= " $SLURM_NTASKS_PER_NODE
echo "Ntasks of jobs:= " $SLURM_NTASKS
echo "NGPUs of jobs:= " $USER_NGPUS
echo "MASTER_ADDR:= " $MASTER_ADDR
echo "MASTER_PORT:= " $MASTER_PORT
echo "WORLD_SIZE:= " $WORLD_SIZE
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "The job is triggered on node:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded
#- Tools
module load cluster-tools/v1.0
module load slurm-tools/v1.0
echo "$(df -h | grep -v tmpfs)"
cluster-quota # nas quota
#- Job step
# (DONE) Be sure to modify the template.multi-gpus-task.sh file as well.
echo "=============== srun begins =================="
srun bash run.sh {cfg_path}
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
from functools import partial
import json
import logging
from pathlib import Path
from vllm import LLM, SamplingParams
def read_config(cfg_path):
with open(cfg_path, "r") as f:
return json.load(f)
def load_tasks(cfg):
"""
format: {"task_id:"..., "prompt":...}
"""
with open(cfg["testset_path"], "r", encoding='utf-8') as f:
tasks = [json.loads(l) for l in f.readlines()]
return tasks[cfg["testset_start"]:]
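# A testset line is expected to look like this (hypothetical example):
#   {"task_id": "HumanEval/0", "prompt": "def add(a, b):\n    ..."}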
def load_llm(cfg):
"""
vllm is not a stable library. Watch its updates.
"""
llm = LLM(model=cfg["llm_path"],
gpu_memory_utilization=0.9,
swap_space=4, # https://github.com/vllm-project/vllm/issues/787#issuecomment-1876636749
enforce_eager=True,
tensor_parallel_size=cfg["num_gpus"])
sampling_params = SamplingParams(n=1,
temperature=0.8,
top_p=0.95,
max_tokens=2048,
stop=cfg["stop"])
return partial(llm.generate, sampling_params=sampling_params, use_tqdm=False)
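# Note: n=1 completion per request; main() duplicates each prompt k times,
# so every task still yields k samples.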
def get_results(outputs):
return [o.outputs[0].text for o in outputs]
def save_jsonl(completion_dir, tid, outputs):
tid_path = tid.replace("/", "-")
path = Path(completion_dir) / f"{tid_path}.jsonl"
lines = [
json.dumps({"task_id": tid, "completion": o.text}) + '\n'
for request_output in outputs
for o in request_output.outputs
]
with open(path, "w", encoding="utf-8") as f:
f.writelines(lines)
def main(cfg_path):
"""
    Save the results after each task is generated,
    so the process can safely be terminated at any time without losing finished tasks.
"""
cfg = read_config(cfg_path)
tasks = load_tasks(cfg)
complete = load_llm(cfg)
total_num = len(tasks)
for i, task in enumerate(tasks):
tid, prompt = task["task_id"], task["prompt"]
logging.info(f"{i:03}/{total_num:03}: {tid}")
inputs = [prompt] * cfg["k"]
outputs = complete(inputs)
save_jsonl(cfg["completion_dir"], tid, outputs)
if __name__ == "__main__":
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('cfgpath', type=Path)
args = parser.parse_args()
main(args.cfgpath)
from dataclasses import dataclass, asdict
import json
from pathlib import Path
import shutil
PROJ = Path("/lustre/S/nanziyuan/projects/llmtest")
EXP = PROJ / "exps"
TEST = PROJ / "testsets"
def get_llm_path(llm_name):
codellama_path = Path("/lustre/models/CodeLlama_240112/")
codellama70b_path = Path("/lustre/S/nanziyuan/projects/llmtest/models")
llm_dicts = {
"7b_base": codellama_path / "codellama-CodeLlama-7b-hf",
"7b_python": codellama_path / "codellama-CodeLlama-7b-Python-hf",
"7b_instr": codellama_path / "CodeLlama-7b-Instruct-hf",
"13b_base": codellama_path / "codellama-CodeLlama-13b-hf",
"13b_python": codellama_path / "codellama-CodeLlama-13b-Python-hf",
"13b_instr": codellama_path / "CodeLlama-13b-Instruct-hf",
"34b_base": codellama_path / "CodeLlama-34b-hf",
"34b_python": codellama_path / "codellama-CodeLlama-34b-Python-hf",
"34b_instr": codellama_path / "CodeLlama-34b-Instruct-hf",
"70b_base": codellama70b_path / "70b_base",
"70b_python": codellama70b_path / "70b_python",
"70b_instr": codellama70b_path / "70b_instruct",
}
return llm_dicts[llm_name].as_posix()
def get_testset_path(testset_name):
testset_dicts = {
"apps_subset": TEST / "apps_subset" / "apps_subset_prompt.jsonl",
"humaneval": TEST / "humaneval" / "humaneval_prompt.jsonl",
}
return testset_dicts[testset_name].as_posix()
@dataclass
class Config:
testset_path: str
llm_path: str
completion_dir: str
k: int
num_gpus: int
stop: list[str]
testset_start: int = 0
def save_config(cfg: Config, path: Path):
with open(path, "w") as f:
json.dump(asdict(cfg), f)
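# A rendered config.json looks roughly like this (hypothetical paths):
# {"testset_path": ".../humaneval_prompt.jsonl", "llm_path": ".../CodeLlama-34b-hf",
#  "completion_dir": ".../completion", "k": 1000, "num_gpus": 4,
#  "stop": ["def"], "testset_start": 0}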
def mk_slurm(llm_name, gpu_num, cfg_path, slurm_dir):
with open("gpu.slurm.template", "r") as f:
template = f.read()
slurm_script = template.format(
name=llm_name,
gpu_num=gpu_num,
cfg_path=cfg_path.absolute()
)
with open(slurm_dir / "gpu.slurm", "w") as f:
f.write(slurm_script)
shutil.copy("./run.sh", slurm_dir / "run.sh")
def mk_config(llm_name, testset_name, exp_id):
path = EXP / f"{llm_name}-{testset_name}-{exp_id:0>3}"
path.mkdir()
completion_dir = path / "completion/"
completion_dir.mkdir()
slurm_dir = path / "slurm"
slurm_dir.mkdir()
num_gpus = 4
cfg = Config(
testset_path=get_testset_path(testset_name),
llm_path=get_llm_path(llm_name),
completion_dir=completion_dir.as_posix(),
k=1000,
num_gpus=num_gpus,
stop=["def"],
)
cfg_path = path / "config.json"
save_config(cfg, cfg_path)
mk_slurm(llm_name, num_gpus, cfg_path, slurm_dir)
if __name__ == "__main__":
mk_config("70b_base", "apps_subset", 0)
from pathlib import Path
PROJ = Path("/lustre/S/nanziyuan/projects/llmtest")
EXP = PROJ / "exps"
def merge_jsonl(completion_path, target_path):
    completion_dir = Path(completion_path)
    with open(target_path / "samples.jsonl", "w") as outfile:
        for f in completion_dir.glob("*.jsonl"):
            print(f)
            with open(f, "r") as infile:
                outfile.write(infile.read())
def mk_test_slurm(exp_path):
    name = Path(exp_path).stem
    eval_path = EXP / f"{name}-test"
    eval_path.mkdir(parents=True, exist_ok=True)  # ensure the merge target directory exists
    completion_path = exp_path / "completion"
    merge_jsonl(completion_path, eval_path)
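if __name__ == "__main__":
    # Example invocation, assuming the experiment directory created by
    # mk_config("70b_base", "apps_subset", 0) elsewhere in this commit:
    mk_test_slurm(EXP / "70b_base-apps_subset-000")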
#!/bin/bash
#- Log information
node_task_msg="
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Task run on: $(hostname -s), PID: ${SLURM_TASK_PID},
Uses GPU(s) ${CUDA_VISIBLE_DEVICES} of this node (GPU indices are per node, not per task);
GlobalID : $SLURM_PROCID of $SLURM_NTASKS,
NodeID : $SLURM_NODEID of $SLURM_JOB_NUM_NODES,
LocalID : $SLURM_LOCALID of $SLURM_NTASKS_PER_NODE;
GPUs_PER_Task = $USER_NGPUS / $SLURM_NTASKS = $(($USER_NGPUS/$SLURM_NTASKS)),
MASTER_ADDR = $MASTER_ADDR
MASTER_PORT = $MASTER_PORT
WORLD_SIZE = $WORLD_SIZE
$(nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit)
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
"
echo "$node_task_msg"
#- Important settings,
# otherwise jobs may fail with an "insufficient RDMA resources" error
ulimit -l unlimited
ulimit -s unlimited
ulimit -v unlimited
ulimit -u 4126448
ulimit -a
#- Load environments
module unload cuda-cudnn
source /tools/module_env.sh
##- CUDA
module load cuda-cudnn/12.1-8.9.3
##- virtualenv
source /workspace/S/nanziyuan/miniconda3/bin/activate base
echo "Task $SLURM_PROCID: "$(module list) # list modules loaded
echo "Task $SLURM_PROCID: "$(which gcc)
echo "Task $SLURM_PROCID: "$(which python)
echo "Task $SLURM_PROCID: "$(which python3)
#- Warning! Please do not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Node $SLURM_NODEID, LocalID $SLURM_LOCALID: Use GPU ${CUDA_VISIBLE_DEVICES}"
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
#- Job step
python /lustre/S/nanziyuan/projects/llmtest/src/inference.py $1
#- End
echo "Task $SLURM_PROCID end at $(date "+%Y-%m-%d %H:%M:%S") on $(hostname -s)"