Commit 09dfbc06 by nanziyuan

first commit

__pycache__/*
run.py
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J eval-{name} # The job name
#SBATCH -o ret-{name}-%j.out # Write the standard output to 'ret-{name}-<job_id>.out'
#SBATCH -e ret-{name}-%j.err # Write the standard error to 'ret-{name}-<job_id>.err'
#- Needed resources
# (TODO)
# Please modify your requirements
#SBATCH -p cpu # Submit to the 'cpu' partition
#SBATCH -t 1-12:00:00 # Run for a maximum time of 1 day, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --ntasks-per-node=1 # Request N*M tasks
#SBATCH --cpus-per-task=32 # Request N*M*K cores
#SBATCH --qos=cpu-normal # Request QOS Type
#- Operations
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- CPU and Mem
lscpu
free -h -w
#- Load environments
source /tools/module_env.sh
module list # list modules loaded by default
#- tools
module load cluster-tools/v1.0
#- language
##- virtualenv
source /lustre/S/nanziyuan/miniconda3/bin/activate srasm
#- Log information
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
#- Job step
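# The name, nstart and nend placeholders below are presumably filled in via
# str.format by a driver script before this file is submitted.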
python /lustre/S/nanziyuan/projects/codellama_test/src/eval.py -n {name} -s {nstart} -e {nend}
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
#!/bin/bash
#SBATCH -J {name} # The job name
#SBATCH -o ret-{name}-%j.out # Write the standard output to 'ret-{name}-<job_id>.out'
#SBATCH -e ret-{name}-%j.err # Write the standard error to 'ret-{name}-<job_id>.err'
#SBATCH -p r8nv-gpu-hw # Submit to the 'r8nv-gpu-hw' partition
#SBATCH -t 0-12:00:00 # Run for a maximum time of 0 days, 12 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:{gpu_num} # Request M GPU per node
#SBATCH --ntasks-per-node=1 # Request P tasks per node
#SBATCH --cpus-per-task=128 # Request Q cores per task, i.e. P*Q cores per node
#SBATCH --qos=gpu-short # Request QOS Type
#SBATCH --constraint="A100&40G&HGX8A100"
export USER_GPUS_PER_NODE={gpu_num} # Keep this consistent with the --gres request above
export USER_NGPUS=$(($USER_GPUS_PER_NODE*$SLURM_JOB_NUM_NODES))
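# Build a host:gpu list such as "node01:4,node02:4"; the awk braces are doubled
# because this file is a Python str.format template (rendered by mk_slurm).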
nodelist_h_format=$(scontrol show hostnames $SLURM_JOB_NODELIST | \
awk -v gpu=$USER_GPUS_PER_NODE '{{printf ((NR>1?",":"")$0":%s"), gpu}}')
if [[ -z $SLURM_NTASKS ]]; then
echo "SLURM_NTASKS is empty, please check your SBATCH parameter."
exit 1
fi
if [[ -z $SLURM_NTASKS_PER_NODE ]]; then
echo "SLURM_NTASKS_PER_NODE is empty, please check your SBATCH parameter."
exit 1
fi
task_size=$(($SLURM_NNODES * $SLURM_NTASKS_PER_NODE))
if [[ $task_size != $SLURM_NTASKS ]]; then
echo "NTASKS_PER_NODE * NNODE != NNTASK, please check your SBATCH parameter."
exit -1
fi
if [[ $task_size != $USER_NGPUS ]]; then
echo "INFO..."
echo "That's a total of $SLURM_NTASKS tasks, requiring a total of $USER_NGPUS GPUs"
echo "Becareful whether your program requires \$SLURM_NTASKS or NGPUS"
fi
#- Global Info
master_addr=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
export MASTER_ADDR=$master_addr
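# Derive MASTER_PORT from the last four digits of the job ID (range 50000-59999)
# so that concurrent jobs are unlikely to collide on the same port.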
export MASTER_PORT=$(expr 50000 + $(echo -n $SLURM_JOBID | tail -c 4))
export WORLD_SIZE=${{USER_NGPUS}}
export NCCL_IB_DISABLE=0
export NCCL_P2P_DISABLE=0
export NCCL_IB_CUDA_SUPPORT=1
export NCCL_NET_GDR_LEVEL=2
export NCCL_IB_HCA="mlx5_0,mlx5_1,mlx5_2,mlx5_3"
#- Log information
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Nodelist:= " $SLURM_JOB_NODELIST
echo "Nodelistname:= " $nodelist_h_format
echo "Number of nodes:= " $SLURM_JOB_NUM_NODES
echo "Ntasks per node:= " $SLURM_NTASKS_PER_NODE
echo "Ntasks of jobs:= " $SLURM_NTASKS
echo "NGPUs of jobs:= " $USER_NGPUS
echo "MASTER_ADDR:= " $MASTER_ADDR
echo "MASTER_PORT:= " $MASTER_PORT
echo "WORLD_SIZE:= " $WORLD_SIZE
echo "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX "
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "The job is triggered on node:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded
#- Tools
module load cluster-tools/v1.0
module load slurm-tools/v1.0
echo "$(df -h | grep -v tmpfs)"
cluster-quota # nas quota
#- Job step
# (DONE) Be sure to modify the template.multi-gpus-task.sh file as well.
echo "=============== srun begins =================="
srun bash run.sh {cfg_path}
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
from functools import partial
import json
import logging
from pathlib import Path
from vllm import LLM, SamplingParams
def read_config(cfg_path):
with open(cfg_path, "r") as f:
return json.load(f)
def load_tasks(cfg):
"""
format: {"task_id:"..., "prompt":...}
"""
with open(cfg["testset_path"], "r", encoding='utf-8') as f:
tasks = [json.loads(l) for l in f.readlines()]
return tasks[cfg["testset_start"]:]
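# A testset line is expected to look like this (hypothetical example):
#   {"task_id": "HumanEval/0", "prompt": "def add(a, b):\n    ..."}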
def load_llm(cfg):
"""
vllm is not a stable library. Watch its updates.
"""
llm = LLM(model=cfg["llm_path"],
gpu_memory_utilization=0.9,
swap_space=4, # https://github.com/vllm-project/vllm/issues/787#issuecomment-1876636749
enforce_eager=True,
tensor_parallel_size=cfg["num_gpus"])
sampling_params = SamplingParams(n=1,
temperature=0.8,
top_p=0.95,
max_tokens=2048,
stop=cfg["stop"])
return partial(llm.generate, sampling_params=sampling_params, use_tqdm=False)
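# Note: n=1 completion per request; main() duplicates each prompt k times,
# so every task still yields k samples.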
def get_results(outputs):
return [o.outputs[0].text for o in outputs]
def save_jsonl(completion_dir, tid, outputs):
tid_path = tid.replace("/", "-")
path = Path(completion_dir) / f"{tid_path}.jsonl"
lines = [
json.dumps({"task_id": tid, "completion": o.text}) + '\n'
for request_output in outputs
for o in request_output.outputs
]
with open(path, "w", encoding="utf-8") as f:
f.writelines(lines)
def main(cfg_path):
"""
    Save the results after each task is generated,
    so the process can safely be terminated at any time without losing finished tasks.
"""
cfg = read_config(cfg_path)
tasks = load_tasks(cfg)
complete = load_llm(cfg)
total_num = len(tasks)
for i, task in enumerate(tasks):
tid, prompt = task["task_id"], task["prompt"]
logging.info(f"{i:03}/{total_num:03}: {tid}")
inputs = [prompt] * cfg["k"]
outputs = complete(inputs)
save_jsonl(cfg["completion_dir"], tid, outputs)
if __name__ == "__main__":
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s", level=logging.INFO)
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('cfgpath', type=Path)
args = parser.parse_args()
main(args.cfgpath)
from dataclasses import dataclass, asdict
import json
from pathlib import Path
import shutil
PROJ = Path("/lustre/S/nanziyuan/projects/llmtest")
EXP = PROJ / "exps"
TEST = PROJ / "testsets"
def get_llm_path(llm_name):
codellama_path = Path("/lustre/models/CodeLlama_240112/")
codellama70b_path = Path("/lustre/S/nanziyuan/projects/llmtest/models")
llm_dicts = {
"7b_base": codellama_path / "codellama-CodeLlama-7b-hf",
"7b_python": codellama_path / "codellama-CodeLlama-7b-Python-hf",
"7b_instr": codellama_path / "CodeLlama-7b-Instruct-hf",
"13b_base": codellama_path / "codellama-CodeLlama-13b-hf",
"13b_python": codellama_path / "codellama-CodeLlama-13b-Python-hf",
"13b_instr": codellama_path / "CodeLlama-13b-Instruct-hf",
"34b_base": codellama_path / "CodeLlama-34b-hf",
"34b_python": codellama_path / "codellama-CodeLlama-34b-Python-hf",
"34b_instr": codellama_path / "CodeLlama-34b-Instruct-hf",
"70b_base": codellama70b_path / "70b_base",
"70b_python": codellama70b_path / "70b_python",
"70b_instr": codellama70b_path / "70b_instruct",
}
return llm_dicts[llm_name].as_posix()
def get_testset_path(testset_name):
testset_dicts = {
"apps_subset": TEST / "apps_subset" / "apps_subset_prompt.jsonl",
"humaneval": TEST / "humaneval" / "humaneval_prompt.jsonl",
}
return testset_dicts[testset_name].as_posix()
@dataclass
class Config:
testset_path: str
llm_path: str
completion_dir: str
k: int
num_gpus: int
stop: list[str]
testset_start: int = 0
def save_config(cfg: Config, path: Path):
with open(path, "w") as f:
json.dump(asdict(cfg), f)
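# A rendered config.json looks roughly like this (hypothetical paths):
# {"testset_path": ".../humaneval_prompt.jsonl", "llm_path": ".../CodeLlama-34b-hf",
#  "completion_dir": ".../completion", "k": 1000, "num_gpus": 4,
#  "stop": ["def"], "testset_start": 0}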
def mk_slurm(llm_name, gpu_num, cfg_path, slurm_dir):
with open("gpu.slurm.template", "r") as f:
template = f.read()
slurm_script = template.format(
name=llm_name,
gpu_num=gpu_num,
cfg_path=cfg_path.absolute()
)
with open(slurm_dir / "gpu.slurm", "w") as f:
f.write(slurm_script)
shutil.copy("./run.sh", slurm_dir / "run.sh")
def mk_config(llm_name, testset_name, exp_id):
path = EXP / f"{llm_name}-{testset_name}-{exp_id:0>3}"
path.mkdir()
completion_dir = path / "completion/"
completion_dir.mkdir()
slurm_dir = path / "slurm"
slurm_dir.mkdir()
num_gpus = 4
cfg = Config(
testset_path=get_testset_path(testset_name),
llm_path=get_llm_path(llm_name),
completion_dir=completion_dir.as_posix(),
k=1000,
num_gpus=num_gpus,
stop=["def"],
)
cfg_path = path / "config.json"
save_config(cfg, cfg_path)
mk_slurm(llm_name, num_gpus, cfg_path, slurm_dir)
if __name__ == "__main__":
mk_config("70b_base", "apps_subset", 0)
from pathlib import Path
PROJ = Path("/lustre/S/nanziyuan/projects/llmtest")
EXP = PROJ / "exps"
def merge_jsonl(completion_path, target_path):
    completion_dir = Path(completion_path)
    with open(target_path / "samples.jsonl", "w") as outfile:
        for f in completion_dir.glob("*.jsonl"):
            print(f)
            with open(f, "r") as infile:
                outfile.write(infile.read())
def mk_test_slurm(exp_path):
    name = Path(exp_path).stem
    eval_path = EXP / f"{name}-test"
    eval_path.mkdir(parents=True, exist_ok=True)  # ensure the merge target directory exists
    completion_path = exp_path / "completion"
    merge_jsonl(completion_path, eval_path)
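if __name__ == "__main__":
    # Example invocation, assuming the experiment directory created by
    # mk_config("70b_base", "apps_subset", 0) elsewhere in this commit:
    mk_test_slurm(EXP / "70b_base-apps_subset-000")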
#!/bin/bash
#- Log information
node_task_msg="
>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
Task run on: $(hostname -s), PID: ${SLURM_TASK_PID},
Uses GPU(s) ${CUDA_VISIBLE_DEVICES} of this node (GPU indices are per node, not per task);
GlobalID : $SLURM_PROCID of $SLURM_NTASKS,
NodeID : $SLURM_NODEID of $SLURM_JOB_NUM_NODES,
LocalID : $SLURM_LOCALID of $SLURM_NTASKS_PER_NODE;
GPUs_PER_Task = $USER_NGPUS / $SLURM_NTASKS = $(($USER_NGPUS/$SLURM_NTASKS)),
MASTER_ADDR = $MASTER_ADDR
MASTER_PORT = $MASTER_PORT
WORLD_SIZE = $WORLD_SIZE
$(nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit)
<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
"
echo "$node_task_msg"
#- Important settings,
# otherwise jobs may fail with an "insufficient RDMA resources" error
ulimit -l unlimited
ulimit -s unlimited
ulimit -v unlimited
ulimit -u 4126448
ulimit -a
#- Load environments
module unload cuda-cudnn
source /tools/module_env.sh
##- CUDA
module load cuda-cudnn/12.1-8.9.3
##- virtualenv
source /workspace/S/nanziyuan/miniconda3/bin/activate base
echo "Task $SLURM_PROCID: "$(module list) # list modules loaded
echo "Task $SLURM_PROCID: "$(which gcc)
echo "Task $SLURM_PROCID: "$(which python)
echo "Task $SLURM_PROCID: "$(which python3)
#- Warning! Please do not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
echo "Node $SLURM_NODEID, LocalID $SLURM_LOCALID: Use GPU ${CUDA_VISIBLE_DEVICES}"
#- The CUDA_VISIBLE_DEVICES variable is assigned and specified by SLURM
#- Job step
python /lustre/S/nanziyuan/projects/llmtest/src/inference.py $1
#- End
echo "Task $SLURM_PROCID end at $(date "+%Y-%m-%d %H:%M:%S") on $(hostname -s)"