Commit 70bd660e by songxinkai

nsight example

parent fc4b5568
@@ -5,3 +5,5 @@
 *bin.finish
 bazel-*
 build
+*.err
+*.out
@@ -8,7 +8,7 @@ using namespace std;
 int main(){
   time_t a = 0, b;
   time(&a); // time.h
-  sleep(2); // unistd.h
+  sleep(2.11111); // unistd.h
   time(&b);
   cout << a << ", " << b << endl;
   cout << b - a << endl;
@@ -24,7 +24,7 @@ int main(){
   cout << float(clc_b - clc_a) / CLOCKS_PER_SEC << endl;
   clc_a = clock(); // time.h
-  sleep(2); // NO CPU clock during sleep
+  sleep(2.4); // NO CPU clock during sleep
   clc_b = clock();
   cout << clc_a << ", " << clc_b << ", CLOCKS_PER_SEC: " << CLOCKS_PER_SEC << endl;
   cout << (clc_b - clc_a) / CLOCKS_PER_SEC << endl;
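A note on these two hunks: POSIX sleep() takes an unsigned int, so the fractional arguments introduced here (2.11111 and 2.4) are silently truncated to 2 seconds and the measured timing does not actually change. If a fractional sleep is the goal, one portable C++ alternative (a sketch, assuming C++11 with <thread> and <chrono> available):

  #include <chrono>
  #include <thread>
  // sleeps for 2.4 seconds, with no truncation to whole seconds
  std::this_thread::sleep_for(std::chrono::milliseconds(2400));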
......
@@ -18,7 +18,7 @@ using std::endl;
   } \
   cout << vec[len - 1] << "}" << endl;
-#define LEN 34
+#define LEN 102400
 // kernel functions
 template<typename Dtype>
@@ -33,9 +33,9 @@ __global__ void add_kernel(const int N, const Dtype* a, const Dtype* b, Dtype* c
 int main(){
   // host memory malloc & initial
-  int* host_a = new int[LEN];
-  int* host_b = new int[LEN];
-  int* host_c = new int[LEN];
+  float* host_a = new float[LEN];
+  float* host_b = new float[LEN];
+  float* host_c = new float[LEN];
   for (int i = 0; i < LEN; ++i){
     host_a[i] = i;
     host_b[i] = i * 100;
@@ -43,14 +43,14 @@ int main(){
   }
   // GPU device start
-  int device_id = 2;
+  int device_id = 0;
   CUDA_CHECK(cudaSetDevice(device_id));
   cout << "Using GPU " << device_id << "." << endl;
   // cudaMalloc & cudaMemcpy & cudaMemset
-  int* dev_a;
-  int* dev_b;
-  int* dev_c;
+  float* dev_a;
+  float* dev_b;
+  float* dev_c;
   CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
   CUDA_CHECK(cudaMalloc((void**)&dev_b, LEN * sizeof(int)));
   CUDA_CHECK(cudaMalloc((void**)&dev_c, LEN * sizeof(int)));
@@ -61,10 +61,9 @@ int main(){
   // add_kernel & result copy & print
   dim3 grid_dim(1, 1, 1); // gridDim.x, gridDim.y, gridDim.z (always 1)
   dim3 block_dim(16, 1, 1); // blockDim.x, blockDim.y, blockDim.z
-  add_kernel<int><<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
-  //add_kernel<<<1, 16>>>(LEN, dev_a, dev_b, dev_c); // Set gridDim.x & blockDim.x
+  add_kernel<float><<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
   CUDA_CHECK(cudaMemcpy(host_c, dev_c, LEN * sizeof(int), cudaMemcpyDeviceToHost));
-  VECTOR_PRINT("add_kernel results", host_c, LEN);
+  VECTOR_PRINT("add_kernel results", host_c, 10);
   // Free gpu memory & free cpu memory
   CUDA_CHECK(cudaFree(dev_a));
......
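One hedged observation on the hunk above: the buffers were changed from int to float, but the cudaMalloc and cudaMemcpy byte counts still say sizeof(int). That only works because sizeof(int) == sizeof(float) == 4 on this platform; sizing off the pointer's own element type removes the trap. A minimal sketch:

  float* dev_a = nullptr;
  // sizeof(*dev_a) follows the element type, so a future type change
  // cannot desynchronize the byte count from the buffer type
  CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(*dev_a)));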
#include <cuda_runtime.h>
#include <iostream>
using std::cin;
using std::cout;
using std::endl;
#define CUDA_CHECK(x) \
  { cudaError_t cuda_error = x; \
    if (cuda_error != cudaSuccess) \
      cout << "cudaError_t: " << cuda_error << " != 0 " \
           << cudaGetErrorString(cuda_error) << endl; \
  }
#define VECTOR_PRINT(head_str, vec, len) \
  cout << head_str << ": {"; \
  for (int i = 0; i < len - 1; ++i){ \
    cout << vec[i] << ", "; \
  } \
  cout << vec[len - 1] << "}" << endl;
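// Editor's note: both macros above expand to multiple statements (and
// VECTOR_PRINT has no enclosing braces at all), so neither is safe inside an
// unbraced if/else; wrapping a macro body in do { ... } while (0) is the
// usual hardening.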
#define LEN 1000000
#define BLOCKDIM 512
#define GRIDDIM 80
// kernel function
__global__ void argmax_kernel(int N, int *a, int *c) {
  __shared__ int cache[BLOCKDIM];                  // one slot per thread in the block
  int tid = threadIdx.x + blockIdx.x * blockDim.x; // thread id in grid
  int cacheIndex = threadIdx.x;                    // thread id in block
  // grid-stride loop: each thread scans indices tid, tid + stride, ...,
  // where stride = blockDim.x * gridDim.x (the thread count in the grid)
  int temp_maxidx = tid;
  while (tid < N) {
    if (a[tid] > a[temp_maxidx]) {
      temp_maxidx = tid;
    }
    tid += blockDim.x * gridDim.x;
  }
  cache[cacheIndex] = temp_maxidx; // per-thread argmax candidate for this block
  // synchronize: make sure all threads in the block have written their candidate
  __syncthreads();
  // tree reduction: fold the block's candidates down into cache[0]
  int i = blockDim.x / 2;
  while (i != 0) {
    if (cacheIndex < i) {
      if (a[cache[cacheIndex + i]] > a[cache[cacheIndex]]) {
        cache[cacheIndex] = cache[cacheIndex + i];
      }
    }
    __syncthreads();
    i /= 2;
  }
  if (cacheIndex == 0) {
    c[blockIdx.x] = cache[0]; // one argmax candidate per block
  }
}
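// Editor's note: the halving reduction above is only correct when blockDim.x
// is a power of two (the BLOCKDIM of 512 used here qualifies). Also, a thread
// whose first tid is already >= N would leave temp_maxidx pointing out of
// range; with LEN = 1000000 and 80 * 512 = 40960 threads every tid is in
// range, but a bounds check would be needed for smaller N.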
int main() {
  // host memory malloc & initial
  int* host_a = new int[LEN];
  int* host_c = new int[GRIDDIM + 1]; // match the (GRIDDIM+1)-element device buffer below
  for (int i = 0; i < LEN; ++i) {
    host_a[i] = i;
  }
  for (int i = 0; i < GRIDDIM + 1; ++i) {
    host_c[i] = 0;
  }
  // GPU device start
  int device_id = 1;
  CUDA_CHECK(cudaSetDevice(device_id));
  cout << "Using GPU " << device_id << "." << endl;
  // cudaMalloc & cudaMemcpy & cudaMemset
  int* dev_a;
  int* dev_c;
  CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
  CUDA_CHECK(cudaMalloc((void**)&dev_c, (GRIDDIM + 1) * sizeof(int)));
  cudaEvent_t start, end;
  CUDA_CHECK(cudaEventCreate(&start));
  CUDA_CHECK(cudaEventCreate(&end));
  cudaEventRecord(start);
  CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(int), cudaMemcpyHostToDevice));
  cudaEventRecord(end);
  cudaEventSynchronize(end);
  // elapsed time (note: the events bracket the host-to-device copy, not the kernel)
  float time_ms = 0.f;
  cudaEventElapsedTime(&time_ms, start, end);
  std::cout << "H2D memcpy time: " << time_ms << " ms" << std::endl;
  CUDA_CHECK(cudaMemset(dev_c, 0, (GRIDDIM + 1) * sizeof(int)));
  // argmax_kernel & result copy & print
  dim3 grid_dim(GRIDDIM, 1, 1);   // gridDim.x, gridDim.y, gridDim.z
  dim3 block_dim(BLOCKDIM, 1, 1); // blockDim.x, blockDim.y, blockDim.z
  const int blocksPerGrid = grid_dim.x * grid_dim.y * grid_dim.z; // == GRIDDIM (not used below)
  argmax_kernel<<<grid_dim, block_dim>>>(LEN, dev_a, dev_c);
  CUDA_CHECK(cudaMemcpy(host_c, dev_c, (GRIDDIM + 1) * sizeof(int), cudaMemcpyDeviceToHost));
  VECTOR_PRINT("c", host_c, GRIDDIM);
  // Free gpu memory & free cpu memory
  CUDA_CHECK(cudaFree(dev_a));
  CUDA_CHECK(cudaFree(dev_c));
  delete[] host_a;
  delete[] host_c;
  return 0;
}
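The kernel only reduces within each block, so after the copy host_c holds one argmax candidate per block, and main prints them without ever combining them. A minimal host-side finish, sketched against the same buffers (not part of the commit; it would run before the delete[] calls):

  // fold the per-block candidates into a single global argmax
  int best = host_c[0];
  for (int i = 1; i < GRIDDIM; ++i) {
    if (host_a[host_c[i]] > host_a[best]) {
      best = host_c[i];
    }
  }
  cout << "global argmax: index " << best << ", value " << host_a[best] << endl;

With host_a[i] = i, the expected answer is index LEN - 1.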
#!/bin/bash
nvcc \
-L/workspace/S/songxinkai/local/TensorRT-8.5.1.7/lib \
-I/workspace/S/songxinkai/local/TensorRT-8.5.1.7/include \
-I/tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/include \
-lnvinfer \
tensorrt_cudastream_example0.cpp
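Since the nvcc invocation above names no output file, the binary lands in ./a.out. An explicit name is one flag away (same paths assumed):

  nvcc \
    -L/workspace/S/songxinkai/local/TensorRT-8.5.1.7/lib \
    -I/workspace/S/songxinkai/local/TensorRT-8.5.1.7/include \
    -I/tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/include \
    -lnvinfer \
    -o tensorrt_cudastream_example0 \
    tensorrt_cudastream_example0.cpp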
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J test # The job name
#SBATCH -o ret-%j.out # Write the standard output to file named 'ret-<job_number>.out'
#SBATCH -e ret-%j.err # Write the standard error to file named 'ret-<job_number>.err'
#- Needed resources
# (TODO)
# Please modify your requirements
#SBATCH -p nv-gpu#,nv-gpu-hw # Submit to the 'nv-gpu' partition ('nv-gpu-hw' is commented out)
#SBATCH -t 0-8:00:00 # Run for a maximum time of 0 days, 8 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request 1 node
#SBATCH --gres=gpu:1 # Request 1 GPU per node
#SBATCH --gres-flags=enforce-binding # CPU-GPU affinity
#SBATCH --constraint="Ampere" # Request GPU type: Ampere (alternatives: Volta (V100/V100S), RTX8000)
###
### The system will alloc 8 cores per gpu by default.
### If you need more or less, use following:
### #SBATCH --cpus-per-task=K # Request K cores
###
#SBATCH --qos=gpu-short # Request QOS Type
#- Operations
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded by default
##- tools
module load cluster-tools/v1.0
module load cmake/3.15.7
module load git/2.17.1
module load vim/8.1.2424
##- language
module load python3/3.6.8
##- cuda
module load cuda-cudnn/11.0-8.0.4
##- virtualenv
# source xxxxx/activate
#- Log information
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
echo "Use GPU ${CUDA_VISIBLE_DEVICES}$" # which gpus
#- Warning! Please not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
#- Job step
sleep 28800 # placeholder workload: hold the allocation for 8 hours
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
Using GPU 0.
==PROF== Connected to process 63252 (/workspace/S/songxinkai/projects/mytests/cuda/a.out)
==PROF== Profiling "add_kernel" - 1: 0%....50%....100% - 1 pass
add_kernel results: {0, 101, 202, 303, 404, 505, 606, 707, 808, 909}
==PROF== Disconnected from process 63252
[63252] a.out@127.0.0.1
void add_kernel<float>(int, float const*, float const*, float*), 2023-Feb-13 00:36:34, Context 1, Stream 7
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.avg Kbyte 20.58
dram__bytes_read.max Kbyte 24.58
dram__bytes_read.min Kbyte 18.43
dram__bytes_read.sum Kbyte 823.04
dram__bytes_write.avg byte 0
dram__bytes_write.max byte 0
dram__bytes_write.min byte 0
dram__bytes_write.sum byte 0
fbpa__dram_read_bytes.avg Kbyte 41.15
fbpa__dram_read_bytes.max Kbyte 45.06
fbpa__dram_read_bytes.min Kbyte 36.86
fbpa__dram_read_bytes.sum Kbyte 823.04
fbpa__dram_write_bytes.avg byte 0
fbpa__dram_write_bytes.max byte 0
fbpa__dram_write_bytes.min byte 0
fbpa__dram_write_bytes.sum byte 0
---------------------------------------------------------------------- --------------- ------------------------------
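A reading aid for the table above: the .avg/.min/.max columns are statistics across the GPU's unit instances and .sum is the device-wide total, so sum / avg recovers the instance count. Here 823.04 / 20.58 ≈ 40 DRAM instances, while the fbpa__ counters report the same 823.04 KB total spread over 823.04 / 41.15 ≈ 20 FBPA instances. dram__bytes_write.sum being 0 suggests the kernel's output (LEN floats) stayed in L2 and was not flushed to DRAM during the measured window.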
#!/bin/bash
#nvcc add.cu && \
#nv-nsight-cu-cli \
# a.out
# nvcc add.cu && \
# /tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/nsight-compute/2020.2.1/nv-nsight-cu-cli \
# --target-processes all \
# a.out
# --metrics dram_read_bytes,dram_write_bytes,smsp__sass_thread_inst_executed_op_dadd_pred_on,smsp__sass_thread_inst_executed_op_dfma_pred_on,smsp__sass_thread_inst_executed_op_dmul_pred_on,smsp__sass_thread_inst_executed_op_hadd_pred_on,smsp__sass_thread_inst_executed_op_hfma_pred_on,smsp__sass_thread_inst_executed_op_hmul_pred_on,smsp__sass_thread_inst_executed_op_fadd_pred_on,smsp__sass_thread_inst_executed_op_ffma_pred_on,smsp__sass_thread_inst_executed_op_fmul_pred_on \
nvcc add.cu && \
/tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/nsight-compute/2020.2.1/nv-nsight-cu-cli \
--metrics dram_read_bytes,dram__bytes_read,dram_write_bytes,dram__bytes_write \
a.out
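nv-nsight-cu-cli is the older name of the Nsight Compute CLI; newer CUDA toolkits ship the same tool as ncu, so an equivalent run (assuming ncu is on PATH) would be:

  ncu --metrics dram__bytes_read,dram__bytes_write a.out

Note that dram_read_bytes / dram_write_bytes in the list above are legacy nvprof-style names, while dram__bytes_read / dram__bytes_write are the current Nsight Compute names, which is presumably why both are passed.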
@@ -23,7 +23,7 @@ using std::endl;
   } \
   cout << vec[LEN - 1] << "}" << endl;
-#define LEN 32
+#define LEN 1000000
 template <typename Dtype>
 struct MAX_OP{
......