Commit 70bd660e by songxinkai

nsight example

parent fc4b5568
@@ -5,3 +5,5 @@
 *bin.finish
 bazel-*
 build
+*.err
+*.out
@@ -8,7 +8,7 @@ using namespace std;
 int main(){
   time_t a = 0, b;
   time(&a); // time.h
-  sleep(2); // unistd.h
+  sleep(2.11111); // unistd.h
   time(&b);
   cout << a << ", " << b << endl;
   cout << b - a << endl;
@@ -24,7 +24,7 @@ int main(){
   cout << float(clc_b - clc_a) / CLOCKS_PER_SEC << endl;
   clc_a = clock(); // time.h
-  sleep(2); // NO CPU clock during sleep
+  sleep(2.4); // NO CPU clock during sleep
   clc_b = clock();
   cout << clc_a << ", " << clc_b << ", CLOCKS_PER_SEC: " << CLOCKS_PER_SEC << endl;
   cout << (clc_b - clc_a) / CLOCKS_PER_SEC << endl;
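A note on these two hunks: POSIX sleep() takes an unsigned int, so the fractional arguments introduced here (2.11111 and 2.4) are silently truncated to 2 seconds and the measured timing does not actually change. If a fractional sleep is the goal, one portable C++ alternative (a sketch, assuming C++11 with <thread> and <chrono> available):

  #include <chrono>
  #include <thread>
  // sleeps for 2.4 seconds, with no truncation to whole seconds
  std::this_thread::sleep_for(std::chrono::milliseconds(2400));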
......
@@ -18,7 +18,7 @@ using std::endl;
   } \
   cout << vec[len - 1] << "}" << endl;
-#define LEN 34
+#define LEN 102400
 // kernel functions
 template<typename Dtype>
@@ -33,9 +33,9 @@ __global__ void add_kernel(const int N, const Dtype* a, const Dtype* b, Dtype* c
 int main(){
   // host memory malloc & initial
-  int* host_a = new int[LEN];
-  int* host_b = new int[LEN];
-  int* host_c = new int[LEN];
+  float* host_a = new float[LEN];
+  float* host_b = new float[LEN];
+  float* host_c = new float[LEN];
   for (int i = 0; i < LEN; ++i){
     host_a[i] = i;
     host_b[i] = i * 100;
@@ -43,14 +43,14 @@ int main(){
   }
   // GPU device start
-  int device_id = 2;
+  int device_id = 0;
   CUDA_CHECK(cudaSetDevice(device_id));
   cout << "Using GPU " << device_id << "." << endl;
   // cudaMalloc & cudaMemcpy & cudaMemset
-  int* dev_a;
-  int* dev_b;
-  int* dev_c;
+  float* dev_a;
+  float* dev_b;
+  float* dev_c;
   CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
   CUDA_CHECK(cudaMalloc((void**)&dev_b, LEN * sizeof(int)));
   CUDA_CHECK(cudaMalloc((void**)&dev_c, LEN * sizeof(int)));
@@ -61,10 +61,9 @@ int main(){
   // add_kernel & result copy & print
   dim3 grid_dim(1, 1, 1); // gridDim.x, gridDim.y, gridDim.z (always 1)
   dim3 block_dim(16, 1, 1); // blockDim.x, blockDim.y, blockDim.z
-  add_kernel<int><<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
-  //add_kernel<<<1, 16>>>(LEN, dev_a, dev_b, dev_c); // Set gridDim.x & blockDim.x
+  add_kernel<float><<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
   CUDA_CHECK(cudaMemcpy(host_c, dev_c, LEN * sizeof(int), cudaMemcpyDeviceToHost));
-  VECTOR_PRINT("add_kernel results", host_c, LEN);
+  VECTOR_PRINT("add_kernel results", host_c, 10);
   // Free gpu memory & free cpu memory
   CUDA_CHECK(cudaFree(dev_a));
......
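One hedged observation on the hunk above: the buffers were changed from int to float, but the cudaMalloc and cudaMemcpy byte counts still say sizeof(int). That only works because sizeof(int) == sizeof(float) == 4 on this platform; sizing off the pointer's own element type removes the trap. A minimal sketch:

  float* dev_a = nullptr;
  // sizeof(*dev_a) follows the element type, so a future type change
  // cannot desynchronize the byte count from the buffer type
  CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(*dev_a)));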
#include <cuda_runtime.h>
#include <iostream>
using std::cin;
using std::cout;
using std::endl;
#define CUDA_CHECK(x) \
  { cudaError_t cuda_error = x; \
    if (cuda_error != cudaSuccess) \
      cout << "cudaError_t: " << cuda_error << " != 0 " \
           << cudaGetErrorString(cuda_error) << endl; \
  }
#define VECTOR_PRINT(head_str, vec, len) \
  cout << head_str << ": {"; \
  for (int i = 0; i < len - 1; ++i){ \
    cout << vec[i] << ", "; \
  } \
  cout << vec[len - 1] << "}" << endl;
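// Editor's note: both macros above expand to multiple statements (and
// VECTOR_PRINT has no enclosing braces at all), so neither is safe inside an
// unbraced if/else; wrapping a macro body in do { ... } while (0) is the
// usual hardening.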
#define LEN 1000000
#define BLOCKDIM 512
#define GRIDDIM 80
// kernel function
__global__ void argmax_kernel(int N, int *a, int *c) {
  __shared__ int cache[BLOCKDIM];                  // one slot per thread in the block
  int tid = threadIdx.x + blockIdx.x * blockDim.x; // thread id in grid
  int cacheIndex = threadIdx.x;                    // thread id in block
  // grid-stride loop: each thread scans indices tid, tid + stride, ...,
  // where stride = blockDim.x * gridDim.x (the thread count in the grid)
  int temp_maxidx = tid;
  while (tid < N) {
    if (a[tid] > a[temp_maxidx]) {
      temp_maxidx = tid;
    }
    tid += blockDim.x * gridDim.x;
  }
  cache[cacheIndex] = temp_maxidx; // per-thread argmax candidate for this block
  // synchronize: make sure all threads in the block have written their candidate
  __syncthreads();
  // tree reduction: fold the block's candidates down into cache[0]
  int i = blockDim.x / 2;
  while (i != 0) {
    if (cacheIndex < i) {
      if (a[cache[cacheIndex + i]] > a[cache[cacheIndex]]) {
        cache[cacheIndex] = cache[cacheIndex + i];
      }
    }
    __syncthreads();
    i /= 2;
  }
  if (cacheIndex == 0) {
    c[blockIdx.x] = cache[0]; // one argmax candidate per block
  }
}
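// Editor's note: the halving reduction above is only correct when blockDim.x
// is a power of two (the BLOCKDIM of 512 used here qualifies). Also, a thread
// whose first tid is already >= N would leave temp_maxidx pointing out of
// range; with LEN = 1000000 and 80 * 512 = 40960 threads every tid is in
// range, but a bounds check would be needed for smaller N.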
int main() {
  // host memory malloc & initial
  int* host_a = new int[LEN];
  int* host_c = new int[GRIDDIM + 1]; // match the (GRIDDIM+1)-element device buffer below
  for (int i = 0; i < LEN; ++i) {
    host_a[i] = i;
  }
  for (int i = 0; i < GRIDDIM + 1; ++i) {
    host_c[i] = 0;
  }
  // GPU device start
  int device_id = 1;
  CUDA_CHECK(cudaSetDevice(device_id));
  cout << "Using GPU " << device_id << "." << endl;
  // cudaMalloc & cudaMemcpy & cudaMemset
  int* dev_a;
  int* dev_c;
  CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
  CUDA_CHECK(cudaMalloc((void**)&dev_c, (GRIDDIM + 1) * sizeof(int)));
  cudaEvent_t start, end;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&end));
  cudaEventRecord(start);
  CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(int), cudaMemcpyHostToDevice));
  cudaEventRecord(end);
  cudaEventSynchronize(end);
  // elapsed time (note: the events bracket the host-to-device copy, not the kernel)
  float time_ms = 0.f;
  cudaEventElapsedTime(&time_ms, start, end);
  std::cout << "H2D memcpy time: " << time_ms << " ms" << std::endl;
  CUDA_CHECK(cudaMemset(dev_c, 0, (GRIDDIM + 1) * sizeof(int)));
  // argmax_kernel & result copy & print
  dim3 grid_dim(GRIDDIM, 1, 1);   // gridDim.x, gridDim.y, gridDim.z
  dim3 block_dim(BLOCKDIM, 1, 1); // blockDim.x, blockDim.y, blockDim.z
  const int blocksPerGrid = grid_dim.x * grid_dim.y * grid_dim.z; // == GRIDDIM (not used below)
  argmax_kernel<<<grid_dim, block_dim>>>(LEN, dev_a, dev_c);
  CUDA_CHECK(cudaMemcpy(host_c, dev_c, (GRIDDIM + 1) * sizeof(int), cudaMemcpyDeviceToHost));
  VECTOR_PRINT("c", host_c, GRIDDIM);
  // Free gpu memory & free cpu memory
  CUDA_CHECK(cudaFree(dev_a));
  CUDA_CHECK(cudaFree(dev_c));
  delete[] host_a;
  delete[] host_c;
  return 0;
}
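The kernel only reduces within each block, so after the copy host_c holds one argmax candidate per block, and main prints them without ever combining them. A minimal host-side finish, sketched against the same buffers (not part of the commit; it would run before the delete[] calls):

  // fold the per-block candidates into a single global argmax
  int best = host_c[0];
  for (int i = 1; i < GRIDDIM; ++i) {
    if (host_a[host_c[i]] > host_a[best]) {
      best = host_c[i];
    }
  }
  cout << "global argmax: index " << best << ", value " << host_a[best] << endl;

With host_a[i] = i, the expected answer is index LEN - 1.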
#!/bin/bash
nvcc \
-L/workspace/S/songxinkai/local/TensorRT-8.5.1.7/lib \
-I/workspace/S/songxinkai/local/TensorRT-8.5.1.7/include \
-I/tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/include \
-lnvinfer \
tensorrt_cudastream_example0.cpp
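Since the nvcc invocation above names no output file, the binary lands in ./a.out. An explicit name is one flag away (same paths assumed):

  nvcc \
    -L/workspace/S/songxinkai/local/TensorRT-8.5.1.7/lib \
    -I/workspace/S/songxinkai/local/TensorRT-8.5.1.7/include \
    -I/tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/include \
    -lnvinfer \
    -o tensorrt_cudastream_example0 \
    tensorrt_cudastream_example0.cpp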
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J test # The job name
#SBATCH -o ret-%j.out # Write the standard output to file named 'ret-<job_number>.out'
#SBATCH -e ret-%j.err # Write the standard error to file named 'ret-<job_number>.err'
#- Needed resources
# (TODO)
# Please modify your requirements
#SBATCH -p nv-gpu#,nv-gpu-hw # Submit to the 'nv-gpu' partition ('nv-gpu-hw' is commented out)
#SBATCH -t 0-8:00:00 # Run for a maximum time of 0 days, 8 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request 1 node
#SBATCH --gres=gpu:1 # Request 1 GPU per node
#SBATCH --gres-flags=enforce-binding # CPU-GPU affinity
#SBATCH --constraint="Ampere" # Request GPU type: Ampere (alternatives: Volta (V100/V100S), RTX8000)
###
### The system will alloc 8 cores per gpu by default.
### If you need more or less, use following:
### #SBATCH --cpus-per-task=K # Request K cores
###
#SBATCH --qos=gpu-short # Request QOS Type
#- Operations
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded by default
##- tools
module load cluster-tools/v1.0
module load cmake/3.15.7
module load git/2.17.1
module load vim/8.1.2424
##- language
module load python3/3.6.8
##- cuda
module load cuda-cudnn/11.0-8.0.4
##- virtualenv
# source xxxxx/activate
#- Log information
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
echo "Use GPU ${CUDA_VISIBLE_DEVICES}$" # which gpus
#- Warning! Please not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
#- Job step
sleep 28800 # placeholder workload: hold the allocation for 8 hours
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
Using GPU 0.
==PROF== Connected to process 63252 (/workspace/S/songxinkai/projects/mytests/cuda/a.out)
==PROF== Profiling "add_kernel" - 1: 0%....50%....100% - 1 pass
add_kernel results: {0, 101, 202, 303, 404, 505, 606, 707, 808, 909}
==PROF== Disconnected from process 63252
[63252] a.out@127.0.0.1
void add_kernel<float>(int, float const*, float const*, float*), 2023-Feb-13 00:36:34, Context 1, Stream 7
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.avg Kbyte 20.58
dram__bytes_read.max Kbyte 24.58
dram__bytes_read.min Kbyte 18.43
dram__bytes_read.sum Kbyte 823.04
dram__bytes_write.avg byte 0
dram__bytes_write.max byte 0
dram__bytes_write.min byte 0
dram__bytes_write.sum byte 0
fbpa__dram_read_bytes.avg Kbyte 41.15
fbpa__dram_read_bytes.max Kbyte 45.06
fbpa__dram_read_bytes.min Kbyte 36.86
fbpa__dram_read_bytes.sum Kbyte 823.04
fbpa__dram_write_bytes.avg byte 0
fbpa__dram_write_bytes.max byte 0
fbpa__dram_write_bytes.min byte 0
fbpa__dram_write_bytes.sum byte 0
---------------------------------------------------------------------- --------------- ------------------------------
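A reading aid for the table above: the .avg/.min/.max columns are statistics across the GPU's unit instances and .sum is the device-wide total, so sum / avg recovers the instance count. Here 823.04 / 20.58 ≈ 40 DRAM instances, while the fbpa__ counters report the same 823.04 KB total spread over 823.04 / 41.15 ≈ 20 FBPA instances. dram__bytes_write.sum being 0 suggests the kernel's output (LEN floats) stayed in L2 and was not flushed to DRAM during the measured window.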
#!/bin/bash
#nvcc add.cu && \
#nv-nsight-cu-cli \
# a.out
# nvcc add.cu && \
# /tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/nsight-compute/2020.2.1/nv-nsight-cu-cli \
# --target-processes all \
# a.out
# --metrics dram_read_bytes,dram_write_bytes,smsp__sass_thread_inst_executed_op_dadd_pred_on,smsp__sass_thread_inst_executed_op_dfma_pred_on,smsp__sass_thread_inst_executed_op_dmul_pred_on,smsp__sass_thread_inst_executed_op_hadd_pred_on,smsp__sass_thread_inst_executed_op_hfma_pred_on,smsp__sass_thread_inst_executed_op_hmul_pred_on,smsp__sass_thread_inst_executed_op_fadd_pred_on,smsp__sass_thread_inst_executed_op_ffma_pred_on,smsp__sass_thread_inst_executed_op_fmul_pred_on \
nvcc add.cu && \
/tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/nsight-compute/2020.2.1/nv-nsight-cu-cli \
--metrics dram_read_bytes,dram__bytes_read,dram_write_bytes,dram__bytes_write \
a.out
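nv-nsight-cu-cli is the older name of the Nsight Compute CLI; newer CUDA toolkits ship the same tool as ncu, so an equivalent run (assuming ncu is on PATH) would be:

  ncu --metrics dram__bytes_read,dram__bytes_write a.out

Note that dram_read_bytes / dram_write_bytes in the list above are legacy nvprof-style names, while dram__bytes_read / dram__bytes_write are the current Nsight Compute names, which is presumably why both are passed.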
@@ -23,7 +23,7 @@ using std::endl;
   } \
   cout << vec[LEN - 1] << "}" << endl;
-#define LEN 32
+#define LEN 1000000
 template <typename Dtype>
 struct MAX_OP{
......