Commit 1801e601 by songxinkai

20190103, add cuda_examples, add vector examples

parent d980982a
*core
*a.out
@@ -64,7 +64,7 @@ void write(const string file){
}
int main (){
write("test.bin");
//write("test.bin");
vector<vector<float> > feas;
vector<vector<float> > pis;
vector<float> vs;
line1, 1, 2
line2, 1, 2
line1, 1, 2
line2, 1, 2
#include <iostream>
#include <utility>
#include <thread>
#include <chrono>
#include <functional>
#include <atomic>
void f1(int n)
{
@@ -13,12 +10,13 @@ void f1(int n)
}
}
void f2(int n)
void f2(int& n)
{
for (int i = 0; i < n; ++i) {
std::cout << "Thread 2: " << i << std::endl;
std::this_thread::sleep_for(std::chrono::milliseconds(10));
}
n ++;
}
int main()
@@ -26,9 +24,9 @@ int main()
int n = 100;
std::thread t1; // t1 is not a thread
std::thread t2(f1, n); // pass by value
std::thread t3(f2, n); // pass by reference
std::thread t3(f2, std::ref(n)); // pass by reference
std::thread t4(std::move(t3)); // t4 is now running f2(). t3 is no longer a thread
std::thread t5(f2, n); // pass by reference
std::thread t5(f2, std::ref(n)); // pass by reference
t2.join();
t4.join();
t5.join();
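Worth spelling out from the diff above: std::thread copies its arguments into the new thread's storage, so a plain n is passed by value, and binding it to an int& parameter requires std::ref; that is what makes t3/t5 actually share main's n. A minimal sketch of the difference (illustrative file, not part of this commit):
#include <functional>
#include <iostream>
#include <thread>

void bump(int& x) { ++x; }

int main() {
    int n = 0;
    // std::thread decay-copies its arguments; a reference parameter
    // must be wrapped in std::ref, otherwise this does not compile
    std::thread t(bump, std::ref(n));
    t.join();
    std::cout << "n = " << n << std::endl; // prints 1
    return 0;
}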
#include <iostream>
#include <thread>
#include <vector>
#include <atomic>
#include <mutex>
int main()
{
int n = 0;
int a = 1;
std::cout << "Final value of n is " << n << '\n';
using namespace std;
mutex mtx;
void at_fn1 (atomic<int>* a, int N){ // atomic increments need no mutex
for (int i = 0; i < N; ++i){
(*a) ++;
}
}
void mtx_fn2(vector<int>& a, size_t n) { // take the vector by reference so the resize is visible to the caller
mtx.lock();
a.resize(n); // resize requires a size argument
mtx.unlock();
}
int main(){
int N = 10, M = 100;
vector<thread> my_ths;
atomic<int> at_a(1);
for (int i = 0; i < N; ++i){
my_ths.emplace_back(at_fn1, &at_a, 10000);
}
for (auto &th: my_ths){
th.join();
}
cout << at_a.load() << endl; // prints 100001: the initial 1 plus 10 threads * 10000 atomic increments
return 0;
}
@@ -24,4 +24,3 @@ int main(){
cout << at_a.load() << endl;
return 0;
}
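Since mtx_fn2 above locks and unlocks the mutex by hand, here is a small sketch of the same pattern with std::lock_guard, which releases the lock automatically even if an exception is thrown (illustrative names, not part of this commit):
#include <mutex>
#include <thread>
#include <vector>

std::mutex mtx2;
std::vector<int> shared_vec;

void safe_append(int value) {
    std::lock_guard<std::mutex> lock(mtx2); // locks here, unlocks at end of scope
    shared_vec.push_back(value);
}

int main() {
    std::thread t1(safe_append, 1);
    std::thread t2(safe_append, 2);
    t1.join();
    t2.join();
    return 0;
}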
//#include <thread>
#include <functional>
void fn(int& a){a ++;}
int main (){
int a = 0;
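// reference_wrapper<int> converts implicitly to int&, so the built-in ++ below increments a itself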
std::ref(a) ++;
//std::thread t(fn, std::ref(a));
//t.join();
return 0;
}
No preview for this file type
@@ -16,12 +16,12 @@ int main(){
clock_t clc_a = 0, clc_b = 0;
vector<double> v;
clc_a=clock(); // time.h
for (int i = 0; i < 100000000; ++i){
for (int i = 0; i < 10000000; ++i){
v.push_back(double(i)*i);
}
clc_b=clock();
cout << clc_a << ", " << clc_b <<", CLOCKS_PER_SEC: " << CLOCKS_PER_SEC<< endl;
cout << (clc_b - clc_a) / CLOCKS_PER_SEC << endl;
cout << float(clc_b - clc_a) / CLOCKS_PER_SEC << endl;
clc_a=clock(); // time.h
sleep(2); // NO CPU clock during sleep
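Two points on the timing code above: clock() counts CPU time, not wall time (hence the "NO CPU clock during sleep" remark), and the diff fixes an integer-division truncation. A sketch of the same measurement with std::chrono::steady_clock, which avoids both pitfalls (not part of this commit):
#include <chrono>
#include <iostream>
#include <vector>

int main() {
    auto t0 = std::chrono::steady_clock::now();
    std::vector<double> v;
    for (int i = 0; i < 10000000; ++i) v.push_back(double(i) * i);
    auto t1 = std::chrono::steady_clock::now();
    std::chrono::duration<double> dt = t1 - t0; // floating-point seconds, no truncation
    std::cout << dt.count() << " s" << std::endl;
    return 0;
}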
/*
g++ multithreads_vector_correct_example.cpp -o multithreads_vector_correct -std=c++11 -pthread
*/
#include <iostream>
#include <vector>
#include <unistd.h>
#include <thread>
#include <time.h>
using namespace std;
void fn(vector<vector<float> >& vec){
while(true){
int len = vec.size();
if (len > 0 && vec[len-1].size()>0){
cout << "thread 1, vec.size() = " << len;
cout << ", begin addr: " << &*(vec.begin());
cout << ", vec[-1][0] = " << vec[len-1][0] << endl;
}
usleep(200000); // sleep() takes whole seconds (0.2 truncates to 0); usleep takes microseconds
}
}
int main(){
vector<vector<float> > b;
int N = 100000, M = 19*19*17;
b.resize(N);
for (int i = 0; i < N; ++i){
b[i].resize(M);
}
cout << "b init done." << endl;
thread t1(fn, ref(b));
for (int i = 0; i < N; ++i){
for (int j = 0; j < M; ++j){
b[i][j] = float(j);
}
}
t1.join();
return 0;
}
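The example above is "correct" because b is fully resized before thread t1 starts: no push_back happens afterwards, so the vector never reallocates and the pointers t1 reads stay valid for the whole run (writes to existing elements do not invalidate them, although the unsynchronized reads are still formally racy).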
/*
g++ multithreads_vector_wrong_example.cpp -o multithreads_vector_wrong -std=c++11 -pthread
*/
#include <iostream>
#include <vector>
#include <unistd.h>
#include <thread>
#include <time.h>
using namespace std;
void fn(vector<vector<float> >& vec){
while(true){
int len = vec.size();
if (len > 0 && vec[len-1].size()>0){
cout << "thread 1, vec.size() = " << len;
cout << ", begin addr: " << &*(vec.begin());
cout << ", vec[-1][0] = " << vec[len-1][0] << endl;
}
usleep(200000); // sleep() takes whole seconds (0.2 truncates to 0); usleep takes microseconds
}
}
int main(){
vector<vector<float> > b;
thread t1(fn, ref(b));
int N = 4000000, M = 19*19*17;
for (int i = 0; i < N; ++i){
vector<float> fea(M, i);
b.push_back(fea);
if ((int)b.size() == (int)b.capacity()){ // the next push_back will reallocate and move the whole buffer
cout << &*(b.begin()) << ", " << &*(b.begin()+1) << ", " << &*(b[0].begin()) << ", " << &*(b[0].begin()+1) << ", capacity grow: " << (int)b.capacity() << endl;
}
}
t1.join();
return 0;
}
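A sketch of one way to repair the "wrong" example: guard every access to b with a mutex so the reader can never observe the buffer mid-reallocation (sizes shrunk and names illustrative; not part of this commit):
#include <iostream>
#include <mutex>
#include <thread>
#include <vector>
using namespace std;

mutex vec_mtx;

void reader(vector<vector<float> >& vec) {
    for (int k = 0; k < 10; ++k) {
        lock_guard<mutex> lock(vec_mtx); // no reallocation can happen while we read
        if (!vec.empty() && !vec.back().empty())
            cout << "size = " << vec.size() << ", last[0] = " << vec.back()[0] << endl;
    }
}

int main() {
    vector<vector<float> > b;
    thread t1(reader, ref(b));
    for (int i = 0; i < 1000; ++i) {
        vector<float> fea(17, float(i));
        lock_guard<mutex> lock(vec_mtx); // the writer holds the lock across push_back
        b.push_back(fea);
    }
    t1.join();
    return 0;
}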
/*
g++ size_capacity.cpp -o size_capacity -pthread -std=c++11
*/
#include <iostream>
#include <vector>
#include <time.h>
#include <math.h>
using namespace std;
int main(){
vector<long long> vec;
clock_t clc_0 = 0, clc_1 = 0;
for (int i = 0; i < 100; ++i){
vec.resize(pow(2, i+20)*10); // the request grows geometrically; expect std::length_error or bad_alloc within a few iterations
cout << "old begin: " << &*(vec.begin());
cout << ", old capacity: " << (long long)vec.capacity();
clc_0 = clock();
vec.push_back(0);
clc_1 = clock();
cout << ", new begin: " << &*(vec.begin());
cout << ", push_back time: " << float(clc_1 - clc_0) / CLOCKS_PER_SEC;
cout << ", size: " << (long long)vec.size();
cout << ", capacity: " << (long long)vec.capacity() << endl;
}
return 0;
}
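The loop above times exactly the reallocate-and-move that push_back performs once size has reached capacity. A sketch of sidestepping that cost with reserve() (not part of this commit):
#include <iostream>
#include <vector>

int main() {
    std::vector<long long> vec;
    vec.reserve(1 << 20); // one up-front allocation
    for (int i = 0; i < (1 << 20); ++i) {
        vec.push_back(i); // never reallocates: size never exceeds the reserved capacity
    }
    std::cout << vec.size() << " / " << vec.capacity() << std::endl;
    return 0;
}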
#!/usr/bin/python
import os
import time
while True:
os.system("free -h")
time.sleep(0.2)
#include <cuda_runtime.h>
#include <iostream>
using std::cout;
using std::endl;
#define CUDA_CHECK(x) \
{ cudaError_t cuda_error = x; \
if (cuda_error != cudaSuccess) \
cout << "cudaError_t: " << cuda_error << " != 0 " \
<< cudaGetErrorString(cuda_error) << endl; \
}
#define VECTOR_PRINT(head_str, vec, len) \
cout << head_str << ": {"; \
for (int i = 0; i < len - 1; ++i){ \
cout << vec[i] << ", "; \
} \
cout << vec[len - 1] << "}" << endl;
#define LEN 34
// kernel functions
template<typename Dtype>
__global__ void add_kernel(const int N, const Dtype* a, const Dtype* b, Dtype* c){
// grid-stride loop: the global index must include blockIdx.x,
// otherwise every block would add the same elements
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += gridDim.x * blockDim.x){
c[i] = a[i] + b[i];
}
}
int main(){
// host memory malloc & initial
int* host_a = new int[LEN];
int* host_b = new int[LEN];
int* host_c = new int[LEN];
for (int i = 0; i < LEN; ++i){
host_a[i] = i;
host_b[i] = i * 100;
host_c[i] = -1;
}
// GPU device start
int device_id = 1;
CUDA_CHECK(cudaSetDevice(device_id));
cout << "Using GPU " << device_id << "." << endl;
// cudaMalloc & cudaMemcpy & cudaMemset
int* dev_a;
int* dev_b;
int* dev_c;
CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&dev_b, LEN * sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&dev_c, LEN * sizeof(int)));
CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(int), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dev_b, host_b, LEN * sizeof(int), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(dev_c, 0, LEN * sizeof(int))); // Set value by byte
// add_kernel & result copy & print
dim3 grid_dim(1, 1, 1); // gridDim.x, gridDim.y, gridDim.z (always 1)
dim3 block_dim(16, 1, 1); // blockDim.x, blockDim.y, blockDim.z
add_kernel<int><<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
//add_kernel<<<1, 16>>>(LEN, dev_a, dev_b, dev_c); // Set gridDim.x & blockDim.x
CUDA_CHECK(cudaMemcpy(host_c, dev_c, LEN * sizeof(int), cudaMemcpyDeviceToHost));
VECTOR_PRINT("add_kernel results", host_c, LEN);
// Free gpu memory & free cpu memory
CUDA_CHECK(cudaFree(dev_a));
CUDA_CHECK(cudaFree(dev_b));
CUDA_CHECK(cudaFree(dev_c));
delete[] host_a;
delete[] host_b;
delete[] host_c;
return 0;
}
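One caveat for the launch above: the <<<...>>> syntax returns void, so CUDA_CHECK cannot wrap it directly. A sketch of checking a launch after the fact, reusing the CUDA_CHECK macro defined in this file:
// add_kernel<int><<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
// CUDA_CHECK(cudaGetLastError());        // reports invalid launch configurations
// CUDA_CHECK(cudaDeviceSynchronize());   // reports errors raised while the kernel ran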
#include <cuda_runtime.h>
#include <iostream>
using std::cin;
using std::cout;
using std::endl;
#define CUDA_CHECK(x) \
{ cudaError_t cuda_error = x; \
if (cuda_error != cudaSuccess) \
cout << "cudaError_t: " << cuda_error << " != 0 " \
<< cudaGetErrorString(cuda_error) << endl; \
}
#define VECTOR_PRINT(head_str, vec, len) \
cout << head_str << ": {"; \
for (int i = 0; i < len - 1; ++i){ \
cout << vec[i] << ", "; \
} \
cout << vec[len - 1] << "}" << endl;
#define LEN 16
// kernel function
__global__ void dot_kernel(int N, int *a, int *b, int *c ) {
__shared__ int cache[512]; // one slot per thread in the block (only blockDim.x = 16 entries are used)
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
int temp = 0;
while (tid < N) {
temp += a[tid] * b[tid];
tid += blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp; // each thread parks its partial dot-product in shared memory
// synchronize: make sure every thread in the block has written its partial sum
__syncthreads();
// tree reduction over shared memory (requires blockDim.x to be a power of two)
int i = blockDim.x/2;
while (i != 0) {
if (cacheIndex < i) {
cache[cacheIndex] += cache[cacheIndex + i];
}
__syncthreads();
i /= 2;
}
if (cacheIndex == 0) {
c[blockIdx.x] = cache[0];
}
}
int main() {
// host memory malloc & initial
int* host_a = new int[LEN];
int* host_b = new int[LEN];
int* host_c = new int[LEN];
for (int i = 0; i < LEN; ++i) {
host_a[i] = 1;
host_b[i] = i;
host_c[i] = 0;
}
// GPU device start
int device_id = 1;
CUDA_CHECK(cudaSetDevice(device_id));
cout << "Using GPU " << device_id << "." << endl;
// cudaMalloc & cudaMemcpy & cudaMemset
int* dev_a;
int* dev_b;
int* dev_c;
CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&dev_b, LEN * sizeof(int)));
CUDA_CHECK(cudaMalloc((void**)&dev_c, LEN * sizeof(int)));
CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(int), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemcpy(dev_b, host_b, LEN * sizeof(int), cudaMemcpyHostToDevice));
CUDA_CHECK(cudaMemset(dev_c, 0, LEN * sizeof(int)));
// dot_kernel & result copy & print
dim3 grid_dim(1, 1, 1); // gridDim.x, gridDim.y, gridDim.z
dim3 block_dim(16, 1, 1); // blockDim.x, blockDim.y, blockDim.z
const int blocksPerGrid = grid_dim.x * grid_dim.y * grid_dim.z;
dot_kernel<<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
CUDA_CHECK(cudaMemcpy(host_c, dev_c, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost));
int c = 0;
for (int i = 0; i < blocksPerGrid; ++i) {
c += host_c[i];
}
cout << "dot_kernel results: " << c << endl;
// Free gpu memory & free cpu memory
CUDA_CHECK(cudaFree(dev_a));
CUDA_CHECK(cudaFree(dev_b));
CUDA_CHECK(cudaFree(dev_c));
delete[] host_a;
delete[] host_b;
delete[] host_c;
return 0;
}
#include <cuda_runtime.h>
#include <iostream>
using std::cout;
using std::endl;
#define CUDA_CHECK(x) \
{ cudaError_t cuda_error = x; \
if (cuda_error != cudaSuccess) \
cout << "cudaError_t: " << cuda_error << " != 0 " \
<< cudaGetErrorString(cuda_error) << endl; \
}
#define VECTOR_PRINT(head_str, vec, len) \
cout << head_str << ": {"; \
for (int i = 0; i < len - 1; ++i){ \
cout << vec[i] << ", "; \
} \
cout << vec[len - 1] << "}" << endl;
#define LEN 32
// kernel functions
template<typename Dtype>
__global__ void set_kernel(const int N, Dtype* c){
int bid = blockIdx.y * gridDim.x + blockIdx.x; // linear block index within the grid
// linear thread index; note threadIdx.x varies slowest and threadIdx.z fastest here,
// the reverse of CUDA's usual x-fastest convention
int tid = threadIdx.z + threadIdx.y * blockDim.z + threadIdx.x * blockDim.z * blockDim.y + bid * blockDim.x * blockDim.y * blockDim.z;
c[tid] = blockIdx.x * 10000
+ blockIdx.y * 1000
+ threadIdx.x * 100
+ threadIdx.y * 10
+ threadIdx.z * 1;
}
int main(){
// host memory malloc & initial
int* host_a = new int[LEN];
for (int i = 0; i < LEN; ++i){
host_a[i] = 0;
}
// GPU device start
int device_id = 1;
CUDA_CHECK(cudaSetDevice(device_id));
cout << "Using GPU " << device_id << "." << endl;
// cudaMalloc & cudaMemcpy & cudaMemset
int* dev_a;
CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(int), cudaMemcpyHostToDevice));
// set_kernel & result copy & print
dim3 grid_dim(2, 2, 1); // gridDim.x, gridDim.y, gridDim.z (z kept at 1)
dim3 block_dim(2, 2, 2); // blockDim.x, blockDim.y, blockDim.z
set_kernel<int><<<grid_dim, block_dim>>>(LEN, dev_a);
CUDA_CHECK(cudaMemcpy(host_a, dev_a, LEN * sizeof(int), cudaMemcpyDeviceToHost));
VECTOR_PRINT("set_kernel results", host_a, LEN);
// Free gpu memory & free cpu memory
CUDA_CHECK(cudaFree(dev_a));
delete[] host_a;
return 0;
}
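Each output element thus encodes its own coordinates (blockIdx.x*10000 + blockIdx.y*1000 + threadIdx.x*100 + threadIdx.y*10 + threadIdx.z), so the printed vector makes the grid/block layout directly visible.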
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);
__global__ void addKernel(int *c, const int *a, const int *b)
{
int i = blockIdx.x;
c[i] = a[i] + b[i];
}
int main()
{
const int arraySize = 5;
const int a[arraySize] = { 1, 2, 3, 4, 5 };
const int b[arraySize] = { 10, 20, 30, 40, 50 };
int c[arraySize] = { 0 };
// Add vectors in parallel.
cudaError_t cudaStatus;
int num = 0;
cudaDeviceProp prop;
cudaStatus = cudaGetDeviceCount(&num);
for(int i = 0;i<num;i++)
{
cudaGetDeviceProperties(&prop,i); // query each device's properties (not used further here)
}
cudaStatus = addWithCuda(c, a, b, arraySize);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "addWithCuda failed!");
return 1;
}
printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",c[0],c[1],c[2],c[3],c[4]);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!");
return 1;
}
return 0;
}
// Helper function for using CUDA to add vectors in parallel.
cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
{
int *dev_a = 0;
int *dev_b = 0;
int *dev_c = 0;
cudaStream_t stream[5] = { 0 }; // declared up front so the Error path only destroys streams that were created
cudaError_t cudaStatus;
// Choose which GPU to run on, change this on a multi-GPU system.
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
for(int i = 0;i<5;i++)
{
cudaStreamCreate(&stream[i]); // create the streams
}
// Launch a kernel on the GPU with one thread for each element.
for(int i = 0;i<5;i++)
{
addKernel<<<1,1,0,stream[i]>>>(dev_c+i, dev_a+i, dev_b+i); // launch one element's addition in its own stream
}
// cudaDeviceSynchronize waits for all launched kernels to finish, and returns
// any errors encountered during the launches.
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaThreadSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
Error:
for(int i = 0;i<5;i++)
{
if (stream[i]) cudaStreamDestroy(stream[i]); // destroy only streams that were actually created
}
cudaFree(dev_c);
cudaFree(dev_a);
cudaFree(dev_b);
return cudaStatus;
}
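Each stream here carries a single one-thread launch, so the five per-element additions are allowed to overlap on the device; the cudaDeviceSynchronize call is what guarantees they have all completed before dev_c is copied back.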
#include <cuda_runtime.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <iostream>
#include <algorithm>
using thrust::sort;
using thrust::device_ptr;
using std::cout;
using std::endl;
#define CUDA_CHECK(x) \
{ cudaError_t cuda_error = x; \
if (cuda_error != cudaSuccess) \
cout << "cudaError_t: " << cuda_error << " != 0 " \
<< cudaGetErrorString(cuda_error) << endl; \
}
#define VECTOR_PRINT(head_str, vec, len) \
cout << head_str << ": {"; \
for (int i = 0; i < len - 1; ++i){ \
cout << vec[i] << ", "; \
} \
cout << vec[len - 1] << "}" << endl;
#define LEN 32
template <typename Dtype>
struct MAX_OP{
__host__ __device__
Dtype operator() (const Dtype& x, const Dtype& y) const {
return x > y ? x : y;
}
};
int main(){
// host memory malloc & initial
float* host_a = new float[LEN];
for (int i = 0; i < LEN; ++i){
host_a[i] = LEN - i;
}
// GPU device start
int device_id = 1;
CUDA_CHECK(cudaSetDevice(device_id));
cout << "Using GPU " << device_id << "." << endl;
// cudaMalloc & cudaMemcpy & cudaMemset
float* dev_a;
CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(float)));
CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(float), cudaMemcpyHostToDevice));
// thrust reduction max
device_ptr<float> dp(dev_a);
MAX_OP<float> max_op;
cout << "max: " << thrust::reduce(dp, dp + LEN, float(0), max_op) << endl;
// Free gpu memory & free cpu memory
CUDA_CHECK(cudaFree(dev_a));
delete[] host_a;
return 0;
}
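The hand-written MAX_OP above can also be replaced by thrust's built-in functor, and thrust::device_vector removes the manual cudaMalloc/cudaMemcpy. A sketch of the same max-reduction in that style (not part of this commit):
#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/reduce.h>
#include <iostream>

int main() {
    thrust::device_vector<float> d(32);         // allocated on the GPU
    for (int i = 0; i < 32; ++i) d[i] = 32 - i; // element-wise host-to-device writes
    float m = thrust::reduce(d.begin(), d.end(), 0.0f, thrust::maximum<float>());
    std::cout << "max: " << m << std::endl;     // prints 32
    return 0;
}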
#include <cuda_runtime.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <iostream>
#include <algorithm>
using thrust::sort;
using thrust::device_ptr;
using std::cout;
using std::endl;
#define CUDA_CHECK(x) \
{ cudaError_t cuda_error = x; \
if (cuda_error != cudaSuccess) \
cout << "cudaError_t: " << cuda_error << " != 0 " \
<< cudaGetErrorString(cuda_error) << endl; \
}
#define VECTOR_PRINT(head_str, vec, len) \
cout << head_str << ": {"; \
for (int i = 0; i < len - 1; ++i){ \
cout << vec[i] << ", "; \
} \
cout << vec[len - 1] << "}" << endl;
#define LEN 32
int main(){
// host memory malloc & initial
float* host_a = new float[LEN];
float* host_b = new float[LEN];
for (int i = 0; i < LEN; ++i){
host_a[i] = LEN - i;
host_b[i] = LEN - i;
}
// GPU device start
int device_id = 1;
CUDA_CHECK(cudaSetDevice(device_id));
cout << "Using GPU " << device_id << "." << endl;
// cudaMalloc & cudaMemcpy & cudaMemset
float* dev_a;
CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(float)));
CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(float), cudaMemcpyHostToDevice));
// thrust device sort
device_ptr<float> dp(dev_a);
sort(dp, dp + LEN);
CUDA_CHECK(cudaMemcpy(host_a, dev_a, LEN * sizeof(float), cudaMemcpyDeviceToHost));
VECTOR_PRINT("thrust sort", host_a, LEN);
// std::sort
for (int i = 0; i < LEN; ++i){
host_b[i] = float(LEN) - float(i);
}
std::sort(host_b, host_b+ LEN);
VECTOR_PRINT("std sort", host_b, LEN);
// Free gpu memory & free cpu memory
CUDA_CHECK(cudaFree(dev_a));
delete[] host_a;
delete[] host_b;
return 0;
}
#include <cuda_runtime.h>
#include <thrust/sort.h>
#include <thrust/device_ptr.h>
#include <iostream>
#include <algorithm>
using thrust::sort;
using thrust::device_ptr;
using std::cout;
using std::endl;
#define CUDA_CHECK(x) \
{ cudaError_t cuda_error = x; \
if (cuda_error != cudaSuccess) \
cout << "cudaError_t: " << cuda_error << " != 0 " \
<< cudaGetErrorString(cuda_error) << endl; \
}
#define VECTOR_PRINT(head_str, vec, len) \
cout << head_str << ": {"; \
for (int i = 0; i < len - 1; ++i){ \
cout << vec[i] << ", "; \
} \
cout << vec[len - 1] << "}" << endl;
#define LEN 32
int main(){
// host memory malloc & initial
float* host_a = new float[LEN];
for (int i = 0; i < LEN; ++i){
host_a[i] = LEN - i;
}
// GPU device start
int device_id = 1;
CUDA_CHECK(cudaSetDevice(device_id));
cout << "Using GPU " << device_id << "." << endl;
// cudaMalloc & cudaMemcpy & cudaMemset
float* dev_a;
CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(float)));
CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(float), cudaMemcpyHostToDevice));
// thrust reduction sum
device_ptr<float> dp(dev_a);
thrust::plus<float> add_op;
cout << "sum: " << thrust::reduce(dp, dp + LEN, float(100), add_op) << endl; // the init value is folded in: 100 + (1+2+...+32) = 628
// Free gpu memory & free cpu memory
CUDA_CHECK(cudaFree(dev_a));
delete[] host_a;
return 0;
}
#include <cuda_runtime.h>
#include <thrust/sort.h>
#include <thrust/transform_reduce.h>
#include <thrust/device_ptr.h>
#include <iostream>
#include <algorithm>
using thrust::sort;
using thrust::device_ptr;
using std::cout;
using std::endl;
#define CUDA_CHECK(x) \
{ cudaError_t cuda_error = x; \
if (cuda_error != cudaSuccess) \
cout << "cudaError_t: " << cuda_error << " != 0 " \
<< cudaGetErrorString(cuda_error) << endl; \
}
#define VECTOR_PRINT(head_str, vec, len) \
cout << head_str << ": {"; \
for (int i = 0; i < len - 1; ++i){ \
cout << vec[i] << ", "; \
} \
cout << vec[len - 1] << "}" << endl;
#define LEN 5
// mean_square<T> computes the squared deviation from a fixed mean: f(x) -> (x-mean)*(x-mean)
template <typename T>
struct mean_square
{
T mean;
mean_square(T m){mean = m;}
__host__ __device__
T operator()(const T& x) const {
return (x-mean) * (x-mean);
// return x * x;
}
};
int main(){
// host memory malloc & initial
float* host_a = new float[LEN];
for (int i = 0; i < LEN; ++i){ // start at 0 so host_a[0] is initialized too
host_a[i] = LEN - i;
}
// GPU device start
int device_id = 1;
CUDA_CHECK(cudaSetDevice(device_id));
cout << "Using GPU " << device_id << "." << endl;
// cudaMalloc & cudaMemcpy & cudaMemset
float* dev_a;
CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(float)));
CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(float), cudaMemcpyHostToDevice));
// thrust transform_reduce: sum of squared deviations from the mean
device_ptr<float> dp(dev_a);
mean_square<float> ms_op(1.0);
thrust::plus<float> add_op;
cout << "sum: " << thrust::transform_reduce(dp, dp + LEN, ms_op, float(0), add_op) << endl;
// Free gpu memory & free cpu memory
CUDA_CHECK(cudaFree(dev_a));
delete[] host_a;
return 0;
}
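With the initialization fixed to start at i = 0, host_a is {5, 4, 3, 2, 1} and the program prints 30; dividing that by LEN gives the variance (6.0) around the supplied mean of 1.0.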