Commit 70bd660e by songxinkai

nsight example

parent fc4b5568
@@ -5,3 +5,5 @@
 *bin.finish
 bazel-*
 build
+*.err
+*.out
@@ -8,7 +8,7 @@ using namespace std;
 int main(){
 time_t a = 0, b;
 time(&a); // time.h
-sleep(2); // unistd.h
+sleep(2.11111); // unistd.h
 time(&b);
 cout << a << ", " << b << endl;
 cout << b - a << endl;
@@ -24,7 +24,7 @@ int main(){
 cout << float(clc_b - clc_a) / CLOCKS_PER_SEC << endl;
 clc_a=clock(); // time.h
-sleep(2); // NO CPU clock during sleep
+sleep(2.4); // NO CPU clock during sleep
 clc_b=clock();
 cout << clc_a << ", " << clc_b << ", CLOCKS_PER_SEC: " << CLOCKS_PER_SEC << endl;
 cout << (clc_b - clc_a) / CLOCKS_PER_SEC << endl;
......
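Aside: POSIX sleep() takes an unsigned int, so the fractional arguments introduced above (2.11111 and 2.4) are silently truncated to 2 seconds at the call site. A minimal sketch of an actual sub-second delay, assuming C++11 <chrono>/<thread> is acceptable in this test (not part of the commit):

// sub_second_sleep.cpp -- sketch only, assumes C++11
#include <chrono>
#include <iostream>
#include <thread>
int main(){
  auto t0 = std::chrono::steady_clock::now();
  std::this_thread::sleep_for(std::chrono::milliseconds(2111)); // ~2.111 s, no truncation
  auto t1 = std::chrono::steady_clock::now();
  std::cout << std::chrono::duration<double>(t1 - t0).count() << " s elapsed" << std::endl;
  return 0;
}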
@@ -18,7 +18,7 @@ using std::endl;
 } \
 cout << vec[len - 1] << "}" << endl;
-#define LEN 34
+#define LEN 102400
 // kernel functions
 template<typename Dtype>
@@ -33,9 +33,9 @@ __global__ void add_kernel(const int N, const Dtype* a, const Dtype* b, Dtype* c
 int main(){
 // host memory malloc & initial
-int* host_a = new int[LEN];
-int* host_b = new int[LEN];
-int* host_c = new int[LEN];
+float* host_a = new float[LEN];
+float* host_b = new float[LEN];
+float* host_c = new float[LEN];
 for (int i = 0; i < LEN; ++i){
 host_a[i] = i;
 host_b[i] = i * 100;
@@ -43,14 +43,14 @@ int main(){
 }
 // GPU device start
-int device_id = 2;
+int device_id = 0;
 CUDA_CHECK(cudaSetDevice(device_id));
 cout << "Using GPU " << device_id << "." << endl;
 // cudaMalloc & cudaMemcpy & cudaMemset
-int* dev_a;
-int* dev_b;
-int* dev_c;
+float* dev_a;
+float* dev_b;
+float* dev_c;
 CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
 CUDA_CHECK(cudaMalloc((void**)&dev_b, LEN * sizeof(int)));
 CUDA_CHECK(cudaMalloc((void**)&dev_c, LEN * sizeof(int)));
@@ -61,10 +61,9 @@ int main(){
 // add_kernel & result copy & print
 dim3 grid_dim(1, 1, 1); // gridDim.x, gridDim.y, gridDim.z (always 1)
 dim3 block_dim(16, 1, 1); // blockDim.x, blockDim.y, blockDim.z
-add_kernel<int><<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
-//add_kernel<<<1, 16>>>(LEN, dev_a, dev_b, dev_c); // Set gridDim.x & blockDim.x
+add_kernel<float><<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
 CUDA_CHECK(cudaMemcpy(host_c, dev_c, LEN * sizeof(int), cudaMemcpyDeviceToHost));
-VECTOR_PRINT("add_kernel results", host_c, LEN);
+VECTOR_PRINT("add_kernel results", host_c, 10);
 // Free gpu memory & free cpu memory
 CUDA_CHECK(cudaFree(dev_a));
......
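The hunk above only shows the call site; the body of the templated add_kernel lies outside the diff. For reference, a minimal grid-stride sketch consistent with the signature in the hunk header (the committed kernel may differ):

// Sketch only: with grid_dim(1,1,1) and block_dim(16,1,1), a grid-stride loop
// still covers all LEN elements, each thread handling i, i+stride, i+2*stride, ...
template<typename Dtype>
__global__ void add_kernel(const int N, const Dtype* a, const Dtype* b, Dtype* c){
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x){
    c[i] = a[i] + b[i];
  }
}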
#include <cuda_runtime.h>
#include <iostream>
using std::cin;
using std::cout;
using std::endl;
#define CUDA_CHECK(x) \
{ cudaError_t cuda_error = x; \
if (cuda_error != cudaSuccess) \
cout << "cudaError_t: " << cuda_error << " != 0 " \
<< cudaGetErrorString(cuda_error) << endl; \
}
#define VECTOR_PRINT(head_str, vec, len) \
cout << head_str << ": {"; \
for (int i = 0; i < len - 1; ++i){ \
cout << vec[i] << ", "; \
} \
cout << vec[len - 1] << "}" << endl;
#define LEN 1000000
#define BLOCKDIM 512
#define GRIDDIM 80
// kernel function
__global__ void argmax_kernel(int N, int *a, int *c ) {
__shared__ int cache[BLOCKDIM]; // 512 >= thread_id in block
int tid = threadIdx.x + blockIdx.x * blockDim.x; // thread_id in grid
int cacheIndex = threadIdx.x; // thread_id in block
// thread_num in grid
int temp_maxidx = tid;
while (tid < N) {
if (a[tid] > a[temp_maxidx]){
temp_maxidx = tid;
}
tid += blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp_maxidx; // each thread stores the index of the largest element it has seen
// synchronize: make sure all threads in the block have written their candidate index
__syncthreads();
// block-level argmax reduction over the shared-memory cache
int i = blockDim.x/2;
while (i != 0) {
if (cacheIndex < i) {
if (a[cache[cacheIndex + i]] > a[cache[cacheIndex]]){
cache[cacheIndex] = cache[cacheIndex + i];
}
}
__syncthreads();
i /= 2;
}
if (cacheIndex == 0) {
c[blockIdx.x] = cache[0];
}
}
int main() {
// host memory malloc & initial
int* host_a = new int[LEN];
int* host_c = new int[GRIDDIM + 1]; // +1 to match the (GRIDDIM+1)-element device buffer and init loop below
for (int i = 0; i < LEN; ++i) {
host_a[i] = i;
}
for (int i = 0; i < GRIDDIM+1; ++i) {
host_c[i] = 0;
}
// GPU device start
int device_id = 1;
CUDA_CHECK(cudaSetDevice(device_id));
cout << "Using GPU " << device_id << "." << endl;
// cudaMalloc & cudaMemcpy & cudaMemset
int* dev_a;
int* dev_c;
CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&dev_c, (GRIDDIM+1) * sizeof(int)));
cudaEvent_t start, end;
CUDA_CHECK(cudaEventCreate(&start));
CUDA_CHECK(cudaEventCreate(&end));
cudaEventRecord(start);
CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(int), cudaMemcpyHostToDevice));
cudaEventRecord(end);
cudaEventSynchronize(end);
// measure the host-to-device copy time with CUDA events
float time_ms = 0.f;
cudaEventElapsedTime(&time_ms, start, end);
std::cout << "cudaMemcpy H2D time: " << time_ms << " ms" << std::endl;
CUDA_CHECK(cudaMemset(dev_c, 0, (GRIDDIM+1) * sizeof(int)));
// argmax_kernel & result copy & print
dim3 grid_dim(GRIDDIM, 1, 1); // gridDim.x, gridDim.y, gridDim.z
dim3 block_dim(BLOCKDIM, 1, 1); // blockDim.x, blockDim.y, blockDim.z
const int blocksPerGrid = grid_dim.x * grid_dim.y * grid_dim.z;
argmax_kernel<<<grid_dim, block_dim>>>(LEN, dev_a, dev_c);
CUDA_CHECK(cudaMemcpy(host_c, dev_c, (GRIDDIM+1) * sizeof(int), cudaMemcpyDeviceToHost));
VECTOR_PRINT("c", host_c, GRIDDIM);
// Free gpu memory & free cpu memory
CUDA_CHECK(cudaFree(dev_a));
CUDA_CHECK(cudaFree(dev_c));
delete[] host_a;
delete[] host_c;
return 0;
}
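Note that argmax_kernel only leaves one candidate index per block in dev_c/host_c; the listing above prints those candidates but never reduces them to a single global argmax. A host-side sketch of that final step, reusing host_a and host_c from the listing (not part of the committed file; with host_a[i] = i the expected result is LEN - 1):

// Final reduction sketch: pick the global argmax from the per-block candidates.
int global_argmax = host_c[0];
for (int i = 1; i < GRIDDIM; ++i){
  if (host_a[host_c[i]] > host_a[global_argmax]){
    global_argmax = host_c[i];
  }
}
cout << "global argmax: " << global_argmax << " (value " << host_a[global_argmax] << ")" << endl;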
/*
* Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without modification, are permitted
* provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright notice, this list of
* conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright notice, this list of
* conditions and the following disclaimer in the documentation and/or other materials
* provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the names of its contributors may be used
* to endorse or promote products derived from this software without specific prior written
* permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR
* IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND
* FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/** @file common_device.h
* @author Thomas Müller & Nikolaus Binder, NVIDIA
* @brief Implementation of various miscellaneous CUDA kernels and
device functions.
*/
#pragma once
#include <tiny-cuda-nn/common.h>
#define _USE_MATH_DEFINES
#include <cmath>
#include <cuda_fp16.h>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <tiny-cuda-nn/gpu_matrix.h>
TCNN_NAMESPACE_BEGIN
static constexpr float PI = 3.14159265358979323846f;
static constexpr float SQRT2 = 1.41421356237309504880f;
__host__ __device__ inline float logistic(const float x) {
return 1.0f / (1.0f + expf(-x));
}
__host__ __device__ inline float logit(const float x) {
return -logf(1.0f / (fminf(fmaxf(x, 1e-9f), 1.0f - 1e-9f)) - 1.0f);
}
template <typename V>
struct VectorFragment {
static const uint32_t num_elements = V::N;
V x;
};
static constexpr float K_ACT = 10.0f;
template <typename T, typename fragment_t>
__host__ __device__ void warp_activation(Activation activation, const fragment_t& frag, fragment_t& result) {
switch (activation) {
case Activation::ReLU:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = frag.x[t] * (T)((T)frag.x[t] > (T)0.0f);
}
return;
case Activation::Exponential:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = (T)(expf((float)frag.x[t]));
}
return;
case Activation::Sine:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = (T)(sinf((float)frag.x[t]));
}
return;
case Activation::Sigmoid:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = (T)(logistic((float)frag.x[t]));
}
return;
case Activation::Squareplus:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
float x = (float)frag.x[t] * K_ACT;
result.x[t] = (T)(0.5f * (x + sqrtf(x * x + 4)) / K_ACT);
}
return;
case Activation::Softplus:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = (T)(logf(expf((float)frag.x[t] * K_ACT) + 1.0f) / K_ACT);
}
return;
case Activation::None: result = frag; return;
default:
// Unsupported activation
// assert(false); // Commented out due to isolated strange side-effects on Windows
return;
}
}
template <typename T, typename fragment_t>
__host__ __device__ fragment_t warp_activation(Activation activation, const fragment_t& frag) {
fragment_t result;
warp_activation<T>(activation, frag, result);
return result;
}
template <typename T, typename fragment_t, typename forward_fragment_t>
__host__ __device__ void warp_activation_backward_in(Activation activation, const fragment_t& frag, const forward_fragment_t& forward_frag_in, fragment_t& result) {
switch (activation) {
case Activation::ReLU:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = frag.x[t] * (T)(forward_frag_in.x[t] > (T)0.0f);
}
return;
case Activation::Exponential:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = frag.x[t] * (T)(expf(forward_frag_in.x[t]));
}
return;
case Activation::Sine:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = frag.x[t] * (T)(cosf(forward_frag_in.x[t]));
}
return;
case Activation::Sigmoid:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
float x = logistic(forward_frag_in.x[t]);
result.x[t] = frag.x[t] * (T)(x * (1.0f - x));
}
return;
case Activation::Squareplus:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
float x = (float)forward_frag_in.x[t] * K_ACT;
float y = 0.5f * (x + sqrtf(x * x + 4));
result.x[t] = frag.x[t] * (T)(y * y / (y * y + 1));
}
return;
case Activation::Softplus:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
float tmp = expf((float)frag.x[t] * K_ACT);
result.x[t] = frag.x[t] * (T)(tmp / (tmp + 1));
}
return;
case Activation::None: result = frag; return;
default:
// Unsupported activation
// assert(false); // Commented out due to isolated strange side-effects on Windows
return;
}
}
template <typename T, typename fragment_t, typename forward_fragment_t>
__host__ __device__ fragment_t warp_activation_backward_in(Activation activation, const fragment_t& frag, const forward_fragment_t& forward_frag_in) {
fragment_t result;
warp_activation_backward_in<T>(activation, frag, forward_frag_in, result);
return result;
}
template <typename T, typename fragment_t, typename forward_fragment_t>
__host__ __device__ void warp_activation_backward(Activation activation, const fragment_t& frag, const forward_fragment_t& forward_frag, fragment_t& result) {
switch (activation) {
case Activation::ReLU:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = frag.x[t] * (T)(forward_frag.x[t] > (T)0.0f);
}
return;
case Activation::Exponential:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = frag.x[t] * forward_frag.x[t];
}
return;
case Activation::Sine:
// Sine requires stored pre-activations, which we don't have. We only
// write out the post-activations.
// assert(false); // Commented out due to isolated strange side-effects on Windows
return;
case Activation::Sigmoid:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = frag.x[t] * (T)(forward_frag.x[t] * ((T)1.0f - forward_frag.x[t]));
}
return;
case Activation::Squareplus:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
float y = (float)forward_frag.x[t] * K_ACT;
result.x[t] = frag.x[t] * (T)(y * y / (y * y + 1));
}
return;
case Activation::Softplus:
TCNN_PRAGMA_UNROLL
for (int t=0; t < result.num_elements; t++) {
result.x[t] = frag.x[t] * (T)(1.0f - expf(-(float)forward_frag.x[t] * K_ACT));
}
return;
case Activation::None: result = frag; return;
default:
// Unsupported activation
// assert(false); // Commented out due to isolated strange side-effects on Windows
return;
}
}
template <typename T, typename fragment_t, typename forward_fragment_t>
__host__ __device__ fragment_t warp_activation_backward(Activation activation, const fragment_t& frag, const forward_fragment_t& forward_frag) {
fragment_t result;
warp_activation_backward<T>(activation, frag, forward_frag, result);
return result;
}
template <typename T, uint32_t N>
using vector_fragment_t = VectorFragment<vector_t<T, N>>;
template <typename T, uint32_t N=1>
__global__ void kernel_activation(const uint32_t num_elements, const Activation act, const T* in, T* out) {
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
auto frag = ((vector_fragment_t<T, N>*)in)[i];
warp_activation<T>(act, frag, frag);
((vector_fragment_t<T, N>*)out)[i] = frag;
}
// Transfer functions corresponding to activations; version without biases
template <typename T, uint32_t N=1>
__global__ void kernel_activation_backward(const uint32_t num_elements, const Activation act, const T* __restrict__ values, const T* gradients_out, T* gradients_in) {
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
auto frag_forward_in = ((vector_fragment_t<T, N>*)values)[i];
auto frag = ((vector_fragment_t<T, N>*)gradients_out)[i];
warp_activation_backward_in<T>(act, frag, frag_forward_in, frag);
((vector_fragment_t<T, N>*)gradients_in)[i] = frag;
}
// Transfer functions corresponding to activations, given _output_ values. Only works if the activation is invertible
template <typename T, uint32_t N=1>
__global__ void kernel_activation_backward_output(const uint32_t num_elements, const Activation act, const T* __restrict__ output_values, const T* gradients_out, T* gradients_in) {
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
auto frag_forward_out = ((vector_fragment_t<T, N>*)output_values)[i];
auto frag = ((vector_fragment_t<T, N>*)gradients_out)[i];
warp_activation_backward<T>(act, frag, frag_forward_out, frag);
((vector_fragment_t<T, N>*)gradients_in)[i] = frag;
}
template <typename T>
void activation_gpu(cudaStream_t stream, const uint32_t num_elements, const Activation act, const T* in, T* out) {
static constexpr uint32_t ACTIVATION_VECTOR_SIZE = 16u / sizeof(T);
if (num_elements % ACTIVATION_VECTOR_SIZE != 0) {
throw std::runtime_error{std::string{"activation_gpu: number of elements must be a multiple of "} + std::to_string(ACTIVATION_VECTOR_SIZE)};
}
// Activation::None is a noop
if (act == Activation::None && in == out) {
return;
}
linear_kernel(kernel_activation<T, ACTIVATION_VECTOR_SIZE>, 0, stream, div_round_up(num_elements, ACTIVATION_VECTOR_SIZE), act, in, out);
}
template <typename T>
void activation_gpu(cudaStream_t stream, Activation activation, const GPUMatrixDynamic<T>& input, GPUMatrixDynamic<T>& output) {
if (input.n() != output.n() || input.m() != output.m()) {
throw std::runtime_error(std::string{"Input and output don't have matching size: "} + std::to_string(input.n()) + "!=" + std::to_string(output.n()));
}
activation_gpu(stream, input.n_elements(), activation, input.data(), output.data());
}
template <typename T>
void activation_backward_gpu(cudaStream_t stream, const uint32_t num_elements, const Activation act, const T* __restrict__ values, const T* gradients_out, T* gradients_in) {
static constexpr uint32_t ACTIVATION_VECTOR_SIZE = 16u / sizeof(T);
if (num_elements % ACTIVATION_VECTOR_SIZE != 0) {
throw std::runtime_error{std::string{"activation_backward_gpu: number of elements must be a multiple of "} + std::to_string(ACTIVATION_VECTOR_SIZE)};
}
// Activation transfer is a noop for Activation::None
if (act == Activation::None && gradients_out == gradients_in) {
return;
}
linear_kernel(kernel_activation_backward<T, ACTIVATION_VECTOR_SIZE>, 0, stream, div_round_up(num_elements, ACTIVATION_VECTOR_SIZE), act, values, gradients_out, gradients_in);
}
template <typename T>
void activation_backward_gpu(cudaStream_t stream, Activation activation, const GPUMatrixDynamic<T>& values, GPUMatrixDynamic<T>& gradients) {
if (values.n() != gradients.n() || values.m() != gradients.m()) {
throw std::runtime_error(std::string("Values and gradients don't have matching size: ") + std::to_string(values.n()) + "!=" + std::to_string(gradients.n()));
}
activation_backward_gpu(stream, values.n_elements(), activation, values.data(), gradients.data(), gradients.data());
}
template <typename T>
void activation_backward_output_gpu(cudaStream_t stream, const uint32_t num_elements, const Activation act, const T* __restrict__ output_values, const T* gradients_out, T* gradients_in) {
static constexpr uint32_t ACTIVATION_VECTOR_SIZE = 16u / sizeof(T);
if (num_elements % ACTIVATION_VECTOR_SIZE != 0) {
throw std::runtime_error{std::string{"activation_backward_output_gpu: number of elements must be a multiple of "} + std::to_string(ACTIVATION_VECTOR_SIZE)};
}
// Activation transfer is a noop for Activation::None
if (act == Activation::None && gradients_out == gradients_in) {
return;
}
linear_kernel(kernel_activation_backward_output<T, ACTIVATION_VECTOR_SIZE>, 0, stream, div_round_up(num_elements, ACTIVATION_VECTOR_SIZE), act, output_values, gradients_out, gradients_in);
}
// Expands a 10-bit integer into 30 bits
// by inserting 2 zeros after each bit.
__host__ __device__ inline uint32_t expand_bits(uint32_t v) {
v = (v * 0x00010001u) & 0xFF0000FFu;
v = (v * 0x00000101u) & 0x0F00F00Fu;
v = (v * 0x00000011u) & 0xC30C30C3u;
v = (v * 0x00000005u) & 0x49249249u;
return v;
}
// Calculates a 30-bit Morton code for the
// given 3D point located within the unit cube [0,1].
__host__ __device__ inline uint32_t morton3D(uint32_t x, uint32_t y, uint32_t z) {
uint32_t xx = expand_bits(x);
uint32_t yy = expand_bits(y);
uint32_t zz = expand_bits(z);
return xx | (yy << 1) | (zz << 2);
}
__host__ __device__ inline uint32_t morton3D_invert(uint32_t x) {
x = x & 0x49249249;
x = (x | (x >> 2)) & 0xc30c30c3;
x = (x | (x >> 4)) & 0x0f00f00f;
x = (x | (x >> 8)) & 0xff0000ff;
x = (x | (x >> 16)) & 0x0000ffff;
return x;
}
__host__ __device__ inline uint64_t expand_bits(uint64_t w) {
w &= 0x00000000001fffff;
w = (w | w << 32) & 0x001f00000000ffff;
w = (w | w << 16) & 0x001f0000ff0000ff;
w = (w | w << 8) & 0x010f00f00f00f00f;
w = (w | w << 4) & 0x10c30c30c30c30c3;
w = (w | w << 2) & 0x1249249249249249;
return w;
}
__host__ __device__ inline uint64_t morton3D_64bit(uint32_t x, uint32_t y, uint32_t z) {
return ((expand_bits((uint64_t)x)) | (expand_bits((uint64_t)y) << 1) | (expand_bits((uint64_t)z) << 2));
}
__device__ inline float smoothstep(float val) {
return val*val*(3.0f - 2.0f * val);
}
__device__ inline float smoothstep_derivative(float val) {
return 6*val*(1.0f - val);
}
__device__ inline float identity_fun(float val) {
return val;
}
__device__ inline float identity_derivative(float val) {
return 1;
}
template <typename F, typename FPRIME>
__device__ inline void pos_fract(const float input, float* pos, float* pos_derivative, uint32_t* pos_grid, float scale, F interpolation_fun, FPRIME interpolation_fun_derivative) {
*pos = input * scale + 0.5f;
int tmp = floorf(*pos);
*pos_grid = (uint32_t)tmp;
*pos -= (float)tmp;
*pos_derivative = interpolation_fun_derivative(*pos);
*pos = interpolation_fun(*pos);
}
template <typename F>
__device__ inline void pos_fract(const float input, float* pos, uint32_t* pos_grid, float scale, F interpolation_fun) {
*pos = input * scale + 0.5f;
int tmp = floorf(*pos);
*pos_grid = (uint32_t)tmp;
*pos -= (float)tmp;
*pos = interpolation_fun(*pos);
}
__device__ inline float weight_decay(float relative_weight_decay, float absolute_weight_decay, float weight) {
// Relative weight decay is closely related to l2 regularization, whereas absolute weight decay corresponds to l1 regularization
return (1 - relative_weight_decay) * weight - copysignf(absolute_weight_decay, weight);
}
__device__ inline float gaussian_cdf(const float x, const float inv_radius) {
return normcdff(x * inv_radius);
}
__device__ inline float gaussian_cdf_approx(const float x, const float inv_radius) {
static constexpr float MAGIC_SIGMOID_FACTOR = 1.12f / SQRT2;
return logistic(MAGIC_SIGMOID_FACTOR * x * inv_radius);
}
__device__ inline float gaussian_cdf_approx_derivative(const float result, const float inv_radius) {
static constexpr float MAGIC_SIGMOID_FACTOR = 1.12f / SQRT2;
return result * (1 - result) * MAGIC_SIGMOID_FACTOR * inv_radius;
}
__device__ inline float gaussian_pdf(const float x, const float inv_radius) {
return inv_radius * rsqrtf(2.0f * PI) * expf(-0.5f * (x * x * inv_radius * inv_radius));
}
__device__ inline float gaussian_pdf_max_1(const float x, const float inv_radius) {
return expf(-0.5f * (x * x * inv_radius * inv_radius));
}
__device__ inline float tent(const float x, const float inv_radius) {
return fmaxf(1.0f - fabsf(x * inv_radius), 0.0f);
}
__device__ inline float tent_cdf(const float x, const float inv_radius) {
return fmaxf(0.0f, fminf(1.0f, x * inv_radius + 0.5f));
}
__device__ inline float quartic(const float x, const float inv_radius) {
const float u = x * inv_radius;
const float tmp = fmaxf(1 - u*u, 0.0f);
return ((float)15 / 16) * tmp * tmp;
}
__device__ inline float quartic_cdf_deriv(const float x, const float inv_radius) {
return quartic(x, inv_radius) * inv_radius;
}
__device__ inline float quartic_cdf(const float x, const float inv_radius) {
const float u = x * inv_radius;
const float u2 = u * u;
const float u4 = u2 * u2;
return fmaxf(0.0f, fminf(1.0f, ((float)15 / 16) * u * (1 - ((float)2 / 3) * u2 + ((float)1 / 5) * u4) + 0.5f));
}
__device__ inline uint32_t permute(uint32_t num, uint32_t size) {
const uint32_t A = 10002659; // Large prime number
const uint32_t B = 4234151;
return (num * A + B) % size;
}
template <typename T>
__global__ void shuffle(const uint32_t n_elements, const uint32_t stride, const uint32_t seed, const T* __restrict__ in, T* __restrict__ out) {
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= n_elements * stride) return;
const uint32_t elem_id = i / stride;
const uint32_t member_id = i % stride;
out[i] = in[permute(elem_id ^ seed, n_elements) * stride + member_id];
}
template <typename T>
__global__ void fill_rollover(const uint32_t n_elements, const uint32_t stride, const uint32_t* n_input_elements_ptr, T* inout) {
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
const uint32_t n_input_elements = *n_input_elements_ptr;
if (i < (n_input_elements * stride) || i >= (n_elements * stride) || n_input_elements == 0) return;
T result = inout[i % (n_input_elements * stride)];
inout[i] = result;
}
template <typename T>
__global__ void fill_rollover_and_rescale(const uint32_t n_elements, const uint32_t stride, const uint32_t* n_input_elements_ptr, T* inout) {
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
const uint32_t n_input_elements = *n_input_elements_ptr;
if (i < (n_input_elements * stride) || i >= (n_elements * stride) || n_input_elements == 0) return;
T result = inout[i % (n_input_elements * stride)];
result = (T)((float)result * n_input_elements / n_elements);
inout[i] = result;
}
template <typename T1, typename T2, typename T3>
__global__ void add(const uint32_t num_elements, const T1* data_in_1, const T2* data_in_2, T3* data_out) {
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
data_out[i] = (T3)((float)data_in_1[i] + (float)data_in_2[i]);
}
template <typename T>
__global__ void add(const uint32_t num_elements, const T* __restrict__ data_in, T* __restrict__ data_in_out)
{
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
data_in_out[i] = data_in[i] + data_in_out[i];
}
template <typename T>
__global__ void trim(const uint32_t num_elements, const uint32_t stride, const uint32_t dims, const T* __restrict__ data_in, T* __restrict__ data_out)
{
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
uint32_t idx = i % dims;
uint32_t elem = i / dims;
data_out[i] = data_in[elem * stride + idx];
}
template <typename T>
__global__ void trim_and_cast(const uint32_t num_elements, const uint32_t stride, const uint32_t dims, const T* __restrict__ data_in, float* __restrict__ data_out)
{
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
uint32_t idx = i % dims;
uint32_t elem = i / dims;
data_out[i] = (float)data_in[elem * stride + idx];
}
template <typename T>
__global__ void cast(const uint32_t num_elements, const float* __restrict__ full_precision, T* __restrict__ target)
{
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
target[i] = (T)full_precision[i];
}
template <typename T>
__global__ void cast_from(const uint32_t num_elements, const T* __restrict__ precision, float* __restrict__ full_precision)
{
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
full_precision[i] = (float)precision[i];
}
template <typename T>
__global__ void extract_dimension_pos_neg_kernel(const uint32_t num_elements, const uint32_t dim, const uint32_t fan_in, const uint32_t fan_out, const T* __restrict__ encoded, float* __restrict__ output) {
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
const uint32_t elem_idx = i / fan_out;
const uint32_t dim_idx = i % fan_out;
if (fan_out == 1) {
output[i] = (float)encoded[elem_idx * fan_in + dim];
return;
}
if (dim_idx == 0) {
output[i] = fmaxf(-(float)encoded[elem_idx * fan_in + dim], 0.0f);
} else if (dim_idx == 1) {
output[i] = fmaxf((float)encoded[elem_idx * fan_in + dim], 0.0f);
} else if (dim_idx == 2) {
output[i] = 0;
} else {
output[i] = 1;
}
}
template <typename T>
__global__ void mult_scalar_kernel(const uint32_t num_elements, T* __restrict__ inout, float factor) {
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
inout[i] = (T)((float)inout[i] * factor);
}
template <typename T>
__global__ void mult_kernel(const uint32_t num_elements, const T* factor1, const T* factor2, T* result) {
const uint32_t i = threadIdx.x + blockIdx.x * blockDim.x;
if (i >= num_elements) return;
result[i] = factor1[i] * factor2[i];
}
TCNN_NAMESPACE_END
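For context, a hedged usage sketch of the activation_gpu helper declared above (assumes tiny-cuda-nn is built and linked; tcnn is the namespace opened by TCNN_NAMESPACE_BEGIN). The element count must be a multiple of 16 / sizeof(T), i.e. 4 for float, because the kernels load 16-byte vector fragments; otherwise the function throws:

// Usage sketch only, not part of the header above.
#include <tiny-cuda-nn/common_device.h>
#include <cuda_runtime.h>
#include <cstdint>
// Apply ReLU in place to a device buffer of n floats on the given stream.
void relu_inplace_example(float* d_buf, uint32_t n, cudaStream_t stream){
  tcnn::activation_gpu<float>(stream, n, tcnn::Activation::ReLU, d_buf, d_buf);
}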
#!/bin/bash
nvcc \
-L/workspace/S/songxinkai/local/TensorRT-8.5.1.7/lib \
-I/workspace/S/songxinkai/local/TensorRT-8.5.1.7/include \
-I/tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/include \
-lnvinfer \
tensorrt_cudastream_example0.cpp
#!/bin/bash
#- Job parameters
# (TODO)
# Please modify job name
#SBATCH -J test # The job name
#SBATCH -o ret-%j.out # Write the standard output to file named 'ret-<job_number>.out'
#SBATCH -e ret-%j.err # Write the standard error to file named 'ret-<job_number>.err'
#- Needed resources
# (TODO)
# Please modify your requirements
#SBATCH -p nv-gpu#,nv-gpu-hw # Submit to 'nv-gpu' and 'nv-gpu-hw' Partition
#SBATCH -t 0-8:00:00 # Run for a maximum time of 0 days, 8 hours, 00 mins, 00 secs
#SBATCH --nodes=1 # Request N nodes
#SBATCH --gres=gpu:1 # Request M GPU per node
#SBATCH --gres-flags=enforce-binding # CPU-GPU Affinity
#SBATCH --constraint="Ampere" # Request GPU Type: Volta(V100 or V100S) or RTX8000, Ampere
###
### The system will alloc 8 cores per gpu by default.
### If you need more or less, use following:
### #SBATCH --cpus-per-task=K # Request K cores
###
#SBATCH --qos=gpu-short # Request QOS Type
#- Operations
echo "Job start at $(date "+%Y-%m-%d %H:%M:%S")"
echo "Job run at:"
echo "$(hostnamectl)"
#- Load environments
source /tools/module_env.sh
module list # list modules loaded by default
##- tools
module load cluster-tools/v1.0
module load cmake/3.15.7
module load git/2.17.1
module load vim/8.1.2424
##- language
module load python3/3.6.8
##- cuda
module load cuda-cudnn/11.0-8.0.4
##- virtualenv
# source xxxxx/activate
#- Log information
echo $(module list) # list modules loaded
echo $(which gcc)
echo $(which python)
echo $(which python3)
cluster-quota # nas quota
nvidia-smi --format=csv --query-gpu=name,driver_version,power.limit # gpu info
echo "Use GPU ${CUDA_VISIBLE_DEVICES}$" # which gpus
#- Warning! Please not change your CUDA_VISIBLE_DEVICES
#- in `.bashrc`, `env.sh`, or your job script
#- Job step
sleep 28800
#- End
echo "Job end at $(date "+%Y-%m-%d %H:%M:%S")"
Using GPU 0.
==PROF== Connected to process 63252 (/workspace/S/songxinkai/projects/mytests/cuda/a.out)
==PROF== Profiling "add_kernel" - 1: 0%....50%....100% - 1 pass
add_kernel results: {0, 101, 202, 303, 404, 505, 606, 707, 808, 909}
==PROF== Disconnected from process 63252
[63252] a.out@127.0.0.1
void add_kernel<float>(int, float const*, float const*, float*), 2023-Feb-13 00:36:34, Context 1, Stream 7
Section: Command line profiler metrics
---------------------------------------------------------------------- --------------- ------------------------------
dram__bytes_read.avg Kbyte 20.58
dram__bytes_read.max Kbyte 24.58
dram__bytes_read.min Kbyte 18.43
dram__bytes_read.sum Kbyte 823.04
dram__bytes_write.avg byte 0
dram__bytes_write.max byte 0
dram__bytes_write.min byte 0
dram__bytes_write.sum byte 0
fbpa__dram_read_bytes.avg Kbyte 41.15
fbpa__dram_read_bytes.max Kbyte 45.06
fbpa__dram_read_bytes.min Kbyte 36.86
fbpa__dram_read_bytes.sum Kbyte 823.04
fbpa__dram_write_bytes.avg byte 0
fbpa__dram_write_bytes.max byte 0
fbpa__dram_write_bytes.min byte 0
fbpa__dram_write_bytes.sum byte 0
---------------------------------------------------------------------- --------------- ------------------------------
usage: ncu [options] [program] [program-arguments]
General Options:
-h [ --help ] Print this help message.
-v [ --version ] Print the version number.
--mode arg (=launch-and-attach) Select the mode of interaction with the target application:
launch-and-attach
(launch and attach for profiling)
launch
(launch and suspend for later attach)
attach
(attach to launched application)
-p [ --port ] arg (=49152) Base port for connecting to target application
--max-connections arg (=64) Maximum number of ports for connecting to target application
Launch Options:
--check-exit-code arg (=1) Check the application exit code and print an error if it is different than 0.
If set, --replay-mode application will stop after the first pass if the exit
code is not 0.
--injection-path-32 arg (=../linux-desktop-glibc_2_11_3-x86)
Override the default path for the 32-bit injection libraries.
--injection-path-64 arg Override the default path for the 64-bit injection libraries.
--nvtx Enable NVTX support.
--support-32bit Support profiling processes launched from 32-bit applications.
--target-processes arg (=application-only)
Select the processes you want to profile:
application-only
(profile only the application process)
all
(profile the application and its child processes)
Attach Options:
--hostname arg Set hostname / ip address for connection target.
Profile Options:
--devices arg Specify the devices to enable profiling on, separated by comma. By default all
devices are enabled.
--kernel-id arg Set the identifier to use for matching the kernel to profile. The identifier is
of the format "context-id:stream-id:[name-operator:]kernel-name:invocation-nr".
Skip entries that shouldn't be matched, e.g. use "::foobar:2" to match the
second invocation of "foobar" in any context or stream. Use ":7:regex:^foo:" to
match any kernel in stream 7 beginning with "foo" (according to
--kernel-regex-base).
-k [ --kernel-regex ] arg Set the regex to use for matching the kernel name.
--kernel-regex-base arg (=function) Set the basis for --kernel-regex, and --kernel-id kernel-name:
function
demangled
mangled
-c [ --launch-count ] arg Limit the number of collected profile results. The count is only incremented
for launches that match the kernel filters.
-s [ --launch-skip ] arg (=0) Set the number of kernel launches to skip before starting to profile. The count
is incremented for launches that match the kernel filters only.
--launch-skip-before-match arg (=0) Set the number of kernel launches to skip before starting to profile. The count
is incremented for all launches.
--kill arg (=0) Terminate the target application when the requested --launch-count was
profiled.
--replay-mode arg (=kernel) Mechanism used for replaying a kernel launch multiple times to collect all
requested profiling data:
kernel (default)
(Replay individual kernel launches transparently
during the execution of the application.)
application
(Relaunch the entire application multiple times.
Requires deterministic program execution.)
--list-sets List all section sets found in the search paths.
--set arg Identifier of section set to collect. If not specified, the default set is
collected.
--list-sections List all sections found in the search paths.
--section arg Identifier of section to collect. Regex allows matching full section
identifier. If not specified, the default section set is collected.
--section-folder arg Search path for section files. Not recursive.
--section-folder-recursive arg Search path for section files. Recursive.
--list-rules List all analysis rules found in the search paths.
--apply-rules arg (=1) Apply analysis rules for collected sections. If --rule is not set, all
available rules are applied. Allowed values:
on/off
yes/no
--rule arg Identifier of rule to apply. Enables --apply-rules yes.
--list-metrics List all metrics to be collected based on selected sections.
--query-metrics Query available metrics for devices on the system. Use --devices and --chips to
filter which devices to query. By default, metrics reported by this option
require a suffix to be collected. See --query-metrics-mode for details.
--query-metrics-mode arg (=base) Set the mode for querying metrics. Implies --query-metrics.
Available modes:
base (default)
(base names for metrics)
suffix
(suffix names for metrics. Use --metrics to specify the base metrics to
query)
all
(full names for metrics)
--metrics arg Specify all metrics to be profiled, separated by comma.
Names passed to this option support the following prefixes:
regex:<expression> Expands to all metrics that partially match the
expression. Enclose the regular expression in
^...$ to force a full match.
group:<name> Lists all metrics of the metric group with that
name. See section files for valid group names.
breakdown:<metric> Expands to the input metrics of the high-level
throughput metric. If the specified metric does
not support a breakdown, no metrics are added.
If a metric requires a suffix to be valid, and no prefix is used this option
automatically expands the name to all available first-level sub-metrics.
--list-chips List all supported chips that can be used with --chips.
--chips arg Specify the chips for querying metrics, separated by comma.
--profile-from-start arg (=1) Set if application should be profiled from its start. Allowed values:
on/off
yes/no
--disable-profiler-start-stop Disable start/stop profiling. When set, cu(da)ProfilerStart/Stop APIs are
ignored.
--quiet Suppress all profiling output.
--cache-control arg (=all) Control the behavior of the GPU caches during profiling. Allowed values:
all
none
--clock-control arg (=base) Control the behavior of the GPU clocks during profiling. Allowed values:
base
none
--nvtx-include arg Adds include statement to the NVTX filter, which allows selecting kernels to
profile based on NVTX ranges.
--nvtx-exclude arg Adds exclude statement to the NVTX filter, which allows selecting kernels to
profile based on NVTX ranges.
Sampling Options:
--sampling-interval arg (=auto) Set the sampling period in the range of [0..31]. Actual frequency is 2 ^ (5 +
value) cycles. If set to 'auto', the profiler tries to automatically determine
a high sampling frequency without skipping samples or overflowing the output
buffer.
--sampling-max-passes arg (=5) Set maximum number of passes used for sampling.
--sampling-buffer-size arg (=33554432)
Set the size of the device-sided allocation for samples in bytes.
File Options:
--log-file arg Send all tool output to the specified file, or
one of the standard channels. The file will be overwritten.
If the file doesn't exist, a new one will be created.
"stdout" as the whole file name indicates standard output
channel (stdout). (default)
"stderr" as the whole file name indicates standard error
channel (stderr).
-o [ --export ] arg Set the output file for writing the profile results. If not set, a temporary
file will be used which is removed afterwards.
-f [ --force-overwrite ] Force overwriting all output files (any existing files will be overwritten).
-i [ --import ] arg Set the input file for reading profile results.
--open-in-ui Open report in UI instead of showing result on terminal.
Console Output Options:
--csv Use comma-separated values in the output. Implies --print-units base by
default.
--page arg Select report page to output:
details: sections and rules
raw: all collected metrics
--details-all Include all section metrics on details page.
--print-units arg (=auto) Set scaling of metric units. Allowed values:
auto (default)
(Scale metrics to fitting magnitude)
base
(Show metrics with their base unit)
Replaces deprecated option --units.
--print-fp Show all numeric metrics as floating point numbers. Replaces deprecated option
--fp.
--print-metric-instances arg (=none) Set output mode for metrics with instance values:
none (default)
(Only show GPU aggregate value)
values
(Show GPU aggregate followed by all instance values)
--print-summary arg (=none) Set the summary output mode:
none
per-gpu
per-kernel
per-nvtx
Replaces deprecated option --summary.
--print-kernel-base arg (=demangled) Set the basis for kernel name output. See --kernel-regex-base for options.
Replaces deprecated option --kernel-base.
Use the --mode switch to select how to use the tool:
Launch and profile a Cuda application:
ncu CuVectorAdd
Launch an application for later attach:
ncu --mode=launch MyApp
Attach to a previously launched application:
ncu --mode=attach --hostname 127.0.0.1
Applications can also be launched or attached-to with the graphical user interface.
Select specific launches for profiling:
Profile first two launches of kernel 'foo':
ncu -k foo -c 2 CuVectorAdd
Load an existing report:
ncu --import myReport
Usage of --nvtx-include and --nvtx-exclude:
ncu --nvtx --nvtx-include "Domain A@Range A"
Profile kernels wrapped inside start/end range 'Range A' of 'Domain A'
ncu --nvtx --nvtx-exclude "Range A]"
Profile all kernels except kernels wrapped inside push/pop range 'Range A' of <default domain> at the top of the stack.
ncu --nvtx --nvtx-include "Range A" --nvtx-exclude "Range B"
Profile kernels wrapped inside start/end range 'Range A' but not inside 'Range B' of <default domain>
#!/bin/bash
#nvcc add.cu && \
#nv-nsight-cu-cli \
# a.out
# nvcc add.cu && \
# /tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/nsight-compute/2020.2.1/nv-nsight-cu-cli \
# --target-processes all \
# a.out
# --metrics dram_read_bytes,dram_write_bytes,smsp__sass_thread_inst_executed_op_dadd_pred_on,smsp__sass_thread_inst_executed_op_dfma_pred_on,smsp__sass_thread_inst_executed_op_dmul_pred_on,smsp__sass_thread_inst_executed_op_hadd_pred_on,smsp__sass_thread_inst_executed_op_hfma_pred_on,smsp__sass_thread_inst_executed_op_hmul_pred_on,smsp__sass_thread_inst_executed_op_fadd_pred_on,smsp__sass_thread_inst_executed_op_ffma_pred_on,smsp__sass_thread_inst_executed_op_fmul_pred_on \
nvcc add.cu && \
/tools/cluster-software/cuda-cudnn/cuda-11.1-8.0.5/nsight-compute/2020.2.1/nv-nsight-cu-cli \
--metrics dram_read_bytes,dram__bytes_read,dram_write_bytes,dram__bytes_write \
a.out
#include "cuda_runtime.h"
#include "cuda/include/cuda_runtime_api.h"
#include "tensorrt/include/NvInfer.h"
#include "tensorrt/include/NvUffParser.h"
#include "cuda_runtime.h"
#include "cuda_runtime_api.h"
#include "NvInfer.h"
//#include "tensorrt/include/NvUffParser.h"
// CUDA: various checks for different function calls.
#define CUDA_CHECK(condition) \
/* Code block avoids redefinition of cudaError_t error */ \
do { \
cudaError_t error = condition; \
CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \
} while (0)
#define MAX_WORKSPACE (1 << 30)
int main(){
int m_gpu = 0; // GPU id
int m_mebs = 512; // most efficient batch size
bool m_half_precision = true; // whether to use half precision or single precision
std::vector<void*> m_cuda_buf_dev; // device memory
std::vector<float*> m_cuda_buf_host; // page-locked host memory
cudaStream_t stream[2];
nvinfer1::IBuilder* builder;
nvinfer1::INetworkDefinition* network;
nvuffparser::IUffParser* parser;
nvinfer1::ICudaEngine *m_engine;
nvinfer1::IExecutionContext *m_context;
// set device
cudaSetDevice(m_gpu);
// cuda stream
CUDA_CHECK(cudaStreamCreate(&stream[0]));
CUDA_CHECK(cudaStreamCreate(&stream[1]));
// uff parser
parser = nvuffparser::createUffParser();
// register Inputs and Outputs
parser->registerInput("pos_tensor", nvinfer1::DimsCHW(GoFeature::SIZE_HISTORYEACHSIDE+1, GoComm::BORDER_SIZE, GoComm::BORDER_SIZE), nvuffparser::UffInputOrder::kNCHW);
parser->registerOutput("policy_output");
parser->registerOutput("value_output");
builder = nvinfer1::createInferBuilder(g_logger);
network = builder->createNetwork();
if (m_half_precision){
if (!parser->parse(uff_path.string().c_str(), *network, nvinfer1::DataType::kHALF)){
PLOG(ERROR) << "fail to parse";
return -1;
}
builder->setFp16Mode(true);
} else {
if (!parser->parse(uff_path.string().c_str(), *network, nvinfer1::DataType::kFLOAT)){
PLOG(ERROR) << "fail to parse";
return -1;
}
}
builder->setMaxBatchSize(m_mebs);
builder->setMaxWorkspaceSize(MAX_WORKSPACE);
m_engine = builder->buildCudaEngine(*network);
if (m_engine == nullptr) {
PLOG(ERROR) << "load cuda engine error";
return -2;
}
m_context = m_engine->createExecutionContext();
cout << "Model Loaded" << endl;
/// allocate and bind
for (int i = 0; i < m_engine->getNbBindings(); ++i) { // number of binding tensors
auto dim = m_engine->getBindingDimensions(i);
std::string dim_str = "(";
int size = 1;
for (int d = 0; d < dim.nbDims; ++d) { // separate name to avoid shadowing the binding index i
if (d) dim_str += ", ";
dim_str += std::to_string(dim.d[d]);
size *= dim.d[d];
}
dim_str += ")";
LOG(INFO) << "tensorrt binding: " << m_engine->getBindingName(i) << " " << dim_str;
void *dev_buf;
CUDA_CHECK(cudaMalloc(&dev_buf, max_batch_size * size * sizeof(float)));
m_cuda_buf_dev.push_back(dev_buf);
float *host_buf;
CUDA_CHECK(cudaHostAlloc(&host_buf, max_batch_size * size * sizeof(float), cudaHostAllocDefault));
m_cuda_buf_host.push_back(host_buf);
}
// write pinned host memory
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < INPUT_DIM; ++j) {
m_cuda_buf_host[0][i * INPUT_DIM + j] = inputs[i][j];
}
}
void* bindings[3];
int batch_size_kernel;
bindings[0] = m_cuda_buf_dev[0];
bindings[1] = m_cuda_buf_dev[1];
bindings[2] = m_cuda_buf_dev[2];
for (int i = 0; i < batch_size; i += m_mebs){
batch_size_kernel = batch_size - i < m_mebs ? batch_size - i : m_mebs;
// Alternate between the two streams so copies and inference of consecutive chunks can overlap.
// Device offsets are in bytes (hence the char* casts, since void* arithmetic is non-standard);
// host offsets are in float elements.
cudaStream_t s = (i % (m_mebs*2) < m_mebs) ? stream[0] : stream[1];
CUDA_CHECK(cudaMemcpyAsync((char*)m_cuda_buf_dev[0] + i*INPUT_DIM*sizeof(float), m_cuda_buf_host[0] + i*INPUT_DIM, batch_size_kernel*INPUT_DIM*sizeof(float), cudaMemcpyHostToDevice, s));
m_context->enqueue(batch_size_kernel, bindings, s, nullptr);
CUDA_CHECK(cudaMemcpyAsync(m_cuda_buf_host[1] + i*OUTPUT_DIM, (char*)m_cuda_buf_dev[1] + i*OUTPUT_DIM*sizeof(float), batch_size_kernel*OUTPUT_DIM*sizeof(float), cudaMemcpyDeviceToHost, s));
CUDA_CHECK(cudaMemcpyAsync(m_cuda_buf_host[2] + i, (char*)m_cuda_buf_dev[2] + i*sizeof(float), batch_size_kernel*sizeof(float), cudaMemcpyDeviceToHost, s));
bindings[0] = (char*)bindings[0] + batch_size_kernel * INPUT_DIM * sizeof(float);
bindings[1] = (char*)bindings[1] + batch_size_kernel * OUTPUT_DIM * sizeof(float);
bindings[2] = (char*)bindings[2] + batch_size_kernel * sizeof(float);
}
CUDA_CHECK(cudaStreamSynchronize(stream[0]));
CUDA_CHECK(cudaStreamSynchronize(stream[1]));
// destroy
network->destroy();
builder->destroy();
parser->destroy();
m_context->destroy();
m_engine->destroy();
for (auto buf: m_cuda_buf_dev) {
CUDA_CHECK(cudaFree(buf));
}
for (auto buf: m_cuda_buf_host) {
CUDA_CHECK(cudaFreeHost(buf));
}
CUDA_CHECK(cudaStreamDestroy(stream[0]));
CUDA_CHECK(cudaStreamDestroy(stream[1]));
cout << "Model deleted" << endl;
return 0;
}
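The inference loop above interleaves H2D copies, enqueue() calls, and D2H copies across two CUDA streams so consecutive chunks can overlap. A self-contained sketch of the same ping-pong pattern with a trivial kernel in place of TensorRT (illustrative only; assumes h_pinned is page-locked host memory, e.g. from cudaHostAlloc, and d_buf holds total floats):

// Two-stream ping-pong sketch: copy in, process, copy out, alternating streams.
#include <cuda_runtime.h>
__global__ void scale_kernel(int n, float* x){
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) x[i] *= 2.0f;
}
void pingpong(float* h_pinned, float* d_buf, int total, int chunk){
  cudaStream_t s[2];
  cudaStreamCreate(&s[0]);
  cudaStreamCreate(&s[1]);
  for (int off = 0; off < total; off += chunk){
    int n = (total - off < chunk) ? (total - off) : chunk;
    cudaStream_t st = s[(off / chunk) % 2]; // alternate streams so chunks can overlap
    cudaMemcpyAsync(d_buf + off, h_pinned + off, n * sizeof(float), cudaMemcpyHostToDevice, st);
    scale_kernel<<<(n + 255) / 256, 256, 0, st>>>(n, d_buf + off);
    cudaMemcpyAsync(h_pinned + off, d_buf + off, n * sizeof(float), cudaMemcpyDeviceToHost, st);
  }
  cudaStreamSynchronize(s[0]);
  cudaStreamSynchronize(s[1]);
  cudaStreamDestroy(s[0]);
  cudaStreamDestroy(s[1]);
}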
@@ -23,7 +23,7 @@ using std::endl;
 } \
 cout << vec[LEN - 1] << "}" << endl;
-#define LEN 32
+#define LEN 1000000
 template <typename Dtype>
 struct MAX_OP{
......