20190103, add cuda_examples, add vector examples

1801e601 · songxinkai · d980982a · 1801e601 · 1801e601 · 1801e601
Commit 1801e601 authored Jan 03, 2019 by songxinkai
31 changed files
--- a/.gitignore
+++ b/.gitignore
+*core
+*a.out
--- a/c++file/binary_read_write/a.out
+++ b/c++file/binary_read_write/a.out
--- a/c++file/binary_read_write/main.cpp
+++ b/c++file/binary_read_write/main.cpp
@@ -64,7 +64,7 @@ void write(const string file){
 }
 int main (){
-  write("test.bin");
+  //write("test.bin");
  vector<vector<float> > feas;
  vector<vector<float> > pis;
  vector<float> vs;

--- a/c++file/binary_read_write/test.bin
+++ b/c++file/binary_read_write/test.bin
--- a/c++file/out.txt
+++ b/c++file/out.txt
-line1, 1, 2
-line2, 1, 2
-line1, 1, 2
-line2, 1, 2
--- a/c++python/out.bin
+++ b/c++python/out.bin
--- a/c++rand/.main.cpp.swo
+++ b/c++rand/.main.cpp.swo
--- a/c++thread/a.out
+++ b/c++thread/a.out
--- a/c++thread/main.cpp
+++ b/c++thread/main.cpp
 #include <iostream>
-#include <utility>
 #include <thread>
 #include <chrono>
-#include <functional>
-#include <atomic>
 void f1(int n)
 {
@@ -13,12 +10,13 @@ void f1(int n)
    }
 }
-void f2(int n)
+void f2(int& n)
 {
    for (int i = 0; i < n; ++i) {
        std::cout << "Thread 2: " << i << std::endl;
        std::this_thread::sleep_for(std::chrono::milliseconds(10));
    }
+    n ++;
 }
 int main()
@@ -26,9 +24,9 @@ int main()
    int n = 100;
    std::thread t1; // t1 is not a thread
    std::thread t2(f1, n); // pass by value
-    std::thread t3(f2, n); // pass by reference
+    std::thread t3(f2, std::ref(n)); // pass by reference
    std::thread t4(std::move(t3)); // t4 is now running f2(). t3 is no longer a thread
-    std::thread t5(f2, n); // pass by reference
+    std::thread t5(f2, std::ref(n)); // pass by reference
    t2.join();
    t4.join();
    t5.join();

--- a/c++thread/main1.cpp
+++ b/c++thread/main1.cpp
 #include <iostream>
+#include <thread>
+#include <vector>
+#include <atomic>
+#include <mutex>
-int main()
+using namespace std;
-{
+mutex mtx;
-    int n = 0;
-    int a = 1;
+void at_fn1 (atomic<int>* a, int N){
-    std::cout << "Final value of n is " << n << '\n';
+  for (int i = 0; i < N; ++i){
+    (*a) ++;
+  }
+}
+// Takes the global mutex `mtx`, calls resize on its own copy of `a`, then
+// releases the mutex.
+// NOTE(review): std::vector::resize needs a size argument, so `a.resize()`
+// is not expected to compile as written; `a` is also passed by value, so the
+// caller's vector is never modified. Confirm the intended size and parameter.
+void mtx_fn2(vector<int> a) {
+  mtx.lock();
+  a.resize();
+  mtx.unlock();
+}
+// Starts N threads that each run at_fn1 on the shared atomic counter with
+// 10000 iterations, joins all of them, then prints the counter's final value.
+int main(){
+  int N = 10, M = 100;  // M is not used anywhere in this function
+  vector<thread> my_ths;
+  atomic<int> at_a(1);  // counter starts at 1, not 0
+  for (int i = 0; i < N; ++i){
+    my_ths.emplace_back(at_fn1, &at_a, 10000);
+  }
+  // Wait for every worker before reading the result.
+  for (auto &th: my_ths){
+    th.join();
+  }
+  cout << at_a.load() << endl;
+  return 0;
 }
--- a/c++thread/main3.cpp
+++ b/c++thread/main3.cpp
@@ -24,4 +24,3 @@ int main(){
  cout << at_a.load() << endl;
  return 0;
 }
--- a/c++thread/test.cpp
+++ b/c++thread/test.cpp
+//#include <thread>
+#include <functional>
+// Increments the referenced integer by one, in place.
+void fn(int& a){
+  a += 1;
+}
+// Minimal experiment: applies post-increment directly to the wrapper returned
+// by std::ref(a). The thread-based variant is kept below, commented out.
+int main (){
+  int a = 0;
+  // presumably increments `a` through the wrapper's conversion to int& — TODO confirm
+  std::ref(a) ++;
+  //std::thread t(fn, std::ref(a));
+  //t.join();
+  return 0;
+}
--- a/c++time/a.out
+++ b/c++time/a.out
--- a/c++time/main.cpp
+++ b/c++time/main.cpp
@@ -16,12 +16,12 @@ int main(){
  clock_t clc_a = 0, clc_b = 0;
  vector<double> v;
  clc_a=clock(); // time.h
-  for (int i = 0; i < 100000000; ++i){
+  for (int i = 0; i < 10000000; ++i){
    v.push_back(double(i)*i);
  }
  clc_b=clock();
  cout << clc_a << ", " << clc_b <<", CLOCKS_PER_SEC: " << CLOCKS_PER_SEC<< endl;
-  cout << (clc_b - clc_a) / CLOCKS_PER_SEC << endl;
+  cout << float(clc_b - clc_a) / CLOCKS_PER_SEC << endl;
  clc_a=clock(); // time.h
  sleep(2); // NO CPU clock during sleep

--- a/c++vector/.multithreads_vector_correct_example.cpp.swp
+++ b/c++vector/.multithreads_vector_correct_example.cpp.swp
--- a/c++vector/a.out
+++ b/c++vector/a.out
--- a/c++vector/multithreads_vector_correct
+++ b/c++vector/multithreads_vector_correct
--- a/c++vector/multithreads_vector_correct_example.cpp
+++ b/c++vector/multithreads_vector_correct_example.cpp
+/*
+ g++ multithreads_vector_correct_example.cpp -o multithreads_vector_correct -std=c++11 -pthread
+*/
+#include <iostream>
+#include <vector>
+#include <unistd.h>
+#include <thread>
+#include <time.h>
+using namespace std;
+// Reader loop meant to run on a worker thread. Forever prints the outer
+// vector's size, the address of its first row, and the first float of its
+// last row, pausing 200 ms between polls. Never returns.
+// No locking is done here; `vec` is read while another thread may write it.
+void fn(vector<vector<float> >& vec){
+  while(true){
+    int len = vec.size();
+    if (len > 0 && vec[len-1].size()>0){
+      cout << "thread 1, vec.size() = " << len;
+      cout << ", begin addr: "  << &*(vec.begin());
+      cout << ", vec[-1][0] = " << vec[len-1][0] << endl;
+    }
+    // sleep() takes whole seconds as an unsigned int, so sleep(0.2) became
+    // sleep(0) and the loop spun without pausing. usleep gives the 0.2 s.
+    usleep(200000);
+  }
+}
+// Builds an N x M float matrix at its final size, starts the reader thread,
+// then fills every row in place. The outer vector is never resized after the
+// thread has started. The final join waits on the reader thread.
+int main(){
+  const int N = 100000;
+  const int M = 19*19*17;
+  vector<vector<float> > b(N);
+  for (auto& row : b){
+    row.resize(M);
+  }
+  cout << "b init done." << endl;
+  thread t1(fn, ref(b));
+  for (auto& row : b){
+    for (int col = 0; col < M; ++col){
+      row[col] = float(col);
+    }
+  }
+  t1.join();
+  return 0;
+}
--- a/c++vector/multithreads_vector_wrong
+++ b/c++vector/multithreads_vector_wrong
--- a/c++vector/multithreads_vector_wrong_example.cpp
+++ b/c++vector/multithreads_vector_wrong_example.cpp
+/*
+ g++ multithreads_vector_wrong_example.cpp -o multithreads_vector_wrong -std=c++11 -pthread
+*/
+#include <iostream>
+#include <vector>
+#include <unistd.h>
+#include <thread>
+#include <time.h>
+using namespace std;
+// Reader loop meant to run on a worker thread. Forever prints the outer
+// vector's size, the address of its first row, and the first float of its
+// last row, pausing 200 ms between polls. Never returns.
+// No locking is done here; `vec` is read while another thread may grow it.
+void fn(vector<vector<float> >& vec){
+  while(true){
+    int len = vec.size();
+    if (len > 0 && vec[len-1].size()>0){
+      cout << "thread 1, vec.size() = " << len;
+      cout << ", begin addr: "  << &*(vec.begin());
+      cout << ", vec[-1][0] = " << vec[len-1][0] << endl;
+    }
+    // sleep() takes whole seconds as an unsigned int, so sleep(0.2) became
+    // sleep(0) and the loop spun without pausing. usleep gives the 0.2 s.
+    usleep(200000);
+  }
+}
+// Starts the reader thread on an empty vector, then appends N rows of M
+// floats with push_back while the reader is running. Each time size reaches
+// capacity it prints element addresses and the capacity.
+// NOTE(review): the file name marks this as the intentionally wrong variant;
+// the outer vector grows while another thread reads it, with no locking.
+int main(){
+  vector<vector<float> > b;
+  thread t1(fn, ref(b));
+  int N = 4000000, M = 19*19*17;
+  for (int i = 0; i < N; ++i){
+    vector<float> fea(M, i);  // row of M floats, all equal to i
+    b.push_back(fea);
+    // size == capacity means the next push_back has no spare capacity left
+    if ((int)b.size() == (int)b.capacity()){
+      cout << &*(b.begin()) << ", " << &*(b.begin()+1) << ", " << &*(b[0].begin()) << ", " << &*(b[0].begin()+1) << ", capacity grow: " << (int)b.capacity() << endl;
+    }
+  }
+  t1.join();
+  return 0;
+}
--- a/c++vector/size_capacity
+++ b/c++vector/size_capacity
--- a/c++vector/size_capacity.cpp
+++ b/c++vector/size_capacity.cpp
+/*
+  g++ size_capacity.cpp -o size_capacity -pthread -std=c++11
+*/
+#include <iostream>
+#include <vector>
+#include <time.h>
+#include <math.h>
+using namespace std;
+// For growing sizes (10 * 2^(step+20) elements) resizes the vector, then
+// times a single push_back and prints the buffer address, size and capacity
+// before and after it.
+int main(){
+  vector<long long> vec;
+  for (int step = 0; step < 100; ++step){
+    vec.resize(pow(2, step+20)*10);
+    cout << "old begin: " << &*(vec.begin());
+    cout << ", old capacity: " << (long long)vec.capacity();
+    const clock_t before = clock();
+    vec.push_back(0);
+    const clock_t after = clock();
+    cout << ", new begin: " << &*(vec.begin());
+    cout << ", push_back time: " << float(after - before) / CLOCKS_PER_SEC;
+    cout << ", size: " << (long long)vec.size();
+    cout << ", capacity: " << (long long)vec.capacity() << endl;
+  }
+  return 0;
+}
--- a/c++vector/watch_mem.py
+++ b/c++vector/watch_mem.py
+#!/usr/bin/python
+import os
+import time
+# Run `free -h` through the shell every 0.2 seconds until the script is interrupted.
+while True:
+    os.system("free -h")
+    time.sleep(0.2)
--- a/cuda/add.cu
+++ b/cuda/add.cu
+#include <cuda_runtime.h>
+#include <iostream>
+using std::cout;
+using std::endl;
+// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
+// prints the error code and message to stdout. Execution continues afterwards.
+// do { } while (0) makes `CUDA_CHECK(...);` a single statement, so it stays
+// correct inside an unbraced if/else.
+#define CUDA_CHECK(x) \
+    do { cudaError_t cuda_error = (x); \
+        if (cuda_error != cudaSuccess) \
+            cout << "cudaError_t: " << cuda_error << " != 0 " \
+                 << cudaGetErrorString(cuda_error) << endl; \
+    } while (0)
+// Prints `head_str: {v0, v1, ..., vlast}` for the first `len` elements of
+// `vec`. Arguments are parenthesized, the body is one statement, and an empty
+// vector prints `{}` instead of reading vec[-1].
+#define VECTOR_PRINT(head_str, vec, len) \
+    do { \
+        cout << (head_str) << ": {"; \
+        for (int i = 0; i < (len) - 1; ++i){ \
+            cout << (vec)[i] << ", "; \
+        } \
+        if ((len) > 0) cout << (vec)[(len) - 1]; \
+        cout << "}" << endl; \
+    } while (0)
+#define LEN 34
+// kernel functions
+// Element-wise add: c[i] = a[i] + b[i] for every 0 <= i < N.
+// Works with any 1-D grid of 1-D blocks: each thread starts at its global
+// index and advances by the total thread count (gridDim.x * blockDim.x).
+// a, b and c are indexed up to N - 1.
+template<typename Dtype>
+__global__ void add_kernel(const int N, const Dtype* a, const Dtype* b, Dtype* c){
+    // Start at the global thread index. Starting at threadIdx.x alone made
+    // every block begin at the same offsets while still striding by the whole
+    // grid, so with gridDim.x > 1 some elements were never written.
+    const int stride = gridDim.x * blockDim.x;
+    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += stride){
+        c[i] = a[i] + b[i];
+    }
+}
+// Host driver: fills two LEN-element int vectors, adds them on the GPU with
+// add_kernel, copies the result back and prints it.
+int main(){
+    // host memory malloc & initial
+    int* host_a = new int[LEN];
+    int* host_b = new int[LEN];
+    int* host_c = new int[LEN];
+    for (int i = 0; i < LEN; ++i){
+        host_a[i] = i;
+        host_b[i] = i * 100;
+        host_c[i] = -1;
+    }
+    // GPU device start
+    // NOTE(review): device index 1 is hard-coded; confirm it exists on the target host.
+    int device_id = 1;
+    CUDA_CHECK(cudaSetDevice(device_id));
+    cout << "Using GPU " << device_id << "." << endl;
+    // cudaMalloc & cudaMemcpy & cudaMemset
+    int* dev_a;
+    int* dev_b;
+    int* dev_c;
+    CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
+    CUDA_CHECK(cudaMalloc((void**)&dev_b, LEN * sizeof(int)));
+    CUDA_CHECK(cudaMalloc((void**)&dev_c, LEN * sizeof(int)));
+    CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(int), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(dev_b, host_b, LEN * sizeof(int), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemset(dev_c, 0, LEN * sizeof(int))); // Set value by byte
+    // add_kernel & result copy & print
+    dim3 grid_dim(1, 1, 1);   // gridDim.x,  gridDim.y,  gridDim.z (always 1)
+    dim3 block_dim(16, 1, 1); // blockDim.x, blockDim.y, blockDim.z
+    add_kernel<int><<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
+    // A kernel launch returns no status; fetch launch errors explicitly.
+    CUDA_CHECK(cudaGetLastError());
+    //add_kernel<<<1, 16>>>(LEN, dev_a, dev_b, dev_c); // Set gridDim.x & blockDim.x
+    CUDA_CHECK(cudaMemcpy(host_c, dev_c, LEN * sizeof(int), cudaMemcpyDeviceToHost));
+    VECTOR_PRINT("add_kernel results", host_c, LEN);
+    // Free gpu memory & free cpu memory
+    CUDA_CHECK(cudaFree(dev_a));
+    CUDA_CHECK(cudaFree(dev_b));
+    CUDA_CHECK(cudaFree(dev_c));
+    delete[] host_a;
+    delete[] host_b;
+    delete[] host_c;
+    return 0;
+}
--- a/cuda/dot.cu
+++ b/cuda/dot.cu
+#include <cuda_runtime.h>
+#include <iostream>
+using std::cin;
+using std::cout;
+using std::endl;
+// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
+// prints the error code and message to stdout. Execution continues afterwards.
+// do { } while (0) makes `CUDA_CHECK(...);` a single statement, so it stays
+// correct inside an unbraced if/else.
+#define CUDA_CHECK(x) \
+    do { cudaError_t cuda_error = (x); \
+        if (cuda_error != cudaSuccess) \
+            cout << "cudaError_t: " << cuda_error << " != 0 " \
+                 << cudaGetErrorString(cuda_error) << endl; \
+    } while (0)
+// Prints `head_str: {v0, v1, ..., vlast}` for the first `len` elements of
+// `vec`. Arguments are parenthesized, the body is one statement, and an empty
+// vector prints `{}` instead of reading vec[-1].
+#define VECTOR_PRINT(head_str, vec, len) \
+    do { \
+        cout << (head_str) << ": {"; \
+        for (int i = 0; i < (len) - 1; ++i){ \
+            cout << (vec)[i] << ", "; \
+        } \
+        if ((len) > 0) cout << (vec)[(len) - 1]; \
+        cout << "}" << endl; \
+    } while (0)
+#define LEN 16
+// kernel function
+// Per-block partial dot product. Each block sums a[i] * b[i] over the
+// elements its threads visit and writes that partial sum to c[blockIdx.x];
+// the caller adds the per-block partials.
+// Launch layout: 1-D grid of 1-D blocks. blockDim.x must not exceed 512, the
+// size of the static shared cache. c is written at indices 0..gridDim.x-1.
+__global__ void dot_kernel(int N, int *a, int *b, int *c ) {
+    __shared__ int cache[512];
+    int tid = threadIdx.x + blockIdx.x * blockDim.x;
+    int cacheIndex = threadIdx.x;
+    // Each thread accumulates a private partial sum, stepping by the total
+    // number of threads in the grid.
+    int temp = 0;
+    while (tid < N) {
+        temp += a[tid] * b[tid];
+        tid += blockDim.x * gridDim.x;
+    }
+    cache[cacheIndex] = temp;
+    // Synchronize: every thread's partial must be in shared memory before the
+    // reduction below reads it.
+    __syncthreads();
+    // Reduction sum in shared memory. `active` is the number of live slots.
+    // Rounding the half up keeps the middle slot when `active` is odd, so
+    // block sizes that are not a power of two are summed correctly (plain
+    // blockDim.x / 2 halving dropped slots in that case).
+    int active = blockDim.x;
+    while (active > 1) {
+        int half = (active + 1) / 2;
+        if (cacheIndex + half < active) {
+            cache[cacheIndex] += cache[cacheIndex + half];
+        }
+        // `active` is uniform across the block, so all threads reach this barrier.
+        __syncthreads();
+        active = half;
+    }
+    if (cacheIndex == 0) {
+        c[blockIdx.x] = cache[0];
+    }
+}
+// Host driver: computes the dot product of two LEN-element int vectors with
+// dot_kernel, sums the per-block partial results on the host and prints the total.
+int main() {
+    // host memory malloc & initial
+    int* host_a = new int[LEN];
+    int* host_b = new int[LEN];
+    int* host_c = new int[LEN];
+    for (int i = 0; i < LEN; ++i) {
+        host_a[i] = 1;
+        host_b[i] = i;
+        host_c[i] = 0;
+    }
+    // GPU device start
+    // NOTE(review): device index 1 is hard-coded; confirm it exists on the target host.
+    int device_id = 1;
+    CUDA_CHECK(cudaSetDevice(device_id));
+    cout << "Using GPU " << device_id << "." << endl;
+    // cudaMalloc & cudaMemcpy & cudaMemset
+    int* dev_a;
+    int* dev_b;
+    int* dev_c;
+    CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
+    CUDA_CHECK(cudaMalloc((void**)&dev_b, LEN * sizeof(int)));
+    CUDA_CHECK(cudaMalloc((void**)&dev_c, LEN * sizeof(int)));
+    CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(int), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemcpy(dev_b, host_b, LEN * sizeof(int), cudaMemcpyHostToDevice));
+    CUDA_CHECK(cudaMemset(dev_c, 0, LEN * sizeof(int)));
+    // dot_kernel & result copy & print
+    dim3 grid_dim(1, 1, 1);   // gridDim.x,  gridDim.y,  gridDim.z
+    dim3 block_dim(16, 1, 1); // blockDim.x, blockDim.y, blockDim.z
+    const int blocksPerGrid = grid_dim.x * grid_dim.y * grid_dim.z;
+    dot_kernel<<<grid_dim, block_dim>>>(LEN, dev_a, dev_b, dev_c);
+    // A kernel launch returns no status; fetch launch errors explicitly.
+    CUDA_CHECK(cudaGetLastError());
+    // Only the first blocksPerGrid entries of dev_c hold partial sums.
+    CUDA_CHECK(cudaMemcpy(host_c, dev_c, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost));
+    int c = 0;
+    for (int i = 0; i < blocksPerGrid; ++i) {
+        c += host_c[i];
+    }
+    cout << "dot_kernel results: " << c << endl;
+    // Free gpu memory & free cpu memory
+    CUDA_CHECK(cudaFree(dev_a));
+    CUDA_CHECK(cudaFree(dev_b));
+    CUDA_CHECK(cudaFree(dev_c));
+    delete[] host_a;
+    delete[] host_b;
+    delete[] host_c;
+    return 0;
+}
--- a/cuda/set.cu
+++ b/cuda/set.cu
+#include <cuda_runtime.h>
+#include <iostream>
+using std::cout;
+using std::endl;
+// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
+// prints the error code and message to stdout. Execution continues afterwards.
+// do { } while (0) makes `CUDA_CHECK(...);` a single statement, so it stays
+// correct inside an unbraced if/else.
+#define CUDA_CHECK(x) \
+    do { cudaError_t cuda_error = (x); \
+        if (cuda_error != cudaSuccess) \
+            cout << "cudaError_t: " << cuda_error << " != 0 " \
+                 << cudaGetErrorString(cuda_error) << endl; \
+    } while (0)
+// Prints `head_str: {v0, v1, ..., vlast}` for the first `len` elements of
+// `vec`. Arguments are parenthesized, the body is one statement, and an empty
+// vector prints `{}` instead of reading vec[-1].
+#define VECTOR_PRINT(head_str, vec, len) \
+    do { \
+        cout << (head_str) << ": {"; \
+        for (int i = 0; i < (len) - 1; ++i){ \
+            cout << (vec)[i] << ", "; \
+        } \
+        if ((len) > 0) cout << (vec)[(len) - 1]; \
+        cout << "}" << endl; \
+    } while (0)
+#define LEN 32
+// kernel functions
+// Writes, for each thread, a decimal tag of its coordinates into c:
+//   blockIdx.x*10000 + blockIdx.y*1000 + threadIdx.x*100 + threadIdx.y*10 + threadIdx.z
+// at the thread's flattened global index. Blocks are flattened over
+// (blockIdx.y, blockIdx.x); blockIdx.z is not used. Threads inside a block
+// are flattened with threadIdx.z varying fastest.
+// Threads whose flattened index is >= N write nothing.
+template<typename Dtype>
+__global__ void set_kernel(const int N, Dtype* c){
+    const int threads_per_block = blockDim.x * blockDim.y * blockDim.z;
+    const int bid = blockIdx.y * gridDim.x + blockIdx.x;
+    const int tid = threadIdx.z + threadIdx.y * blockDim.z + threadIdx.x * blockDim.z * blockDim.y + bid * threads_per_block;
+    // Guard the store: N was previously ignored, so a launch with more than N
+    // threads wrote past the end of c.
+    if (tid < N){
+        c[tid] = blockIdx.x * 10000
+             + blockIdx.y * 1000
+             + threadIdx.x * 100
+             + threadIdx.y * 10
+             + threadIdx.z * 1;
+    }
+}
+// Host driver: zero-fills a LEN-element int buffer, lets set_kernel tag each
+// element with the coordinates of the thread that wrote it, and prints the result.
+int main(){
+    // host memory malloc & initial
+    int* host_a = new int[LEN];
+    for (int i = 0; i < LEN; ++i){
+        host_a[i] = 0;
+    }
+    // GPU device start
+    // NOTE(review): device index 1 is hard-coded; confirm it exists on the target host.
+    int device_id = 1;
+    CUDA_CHECK(cudaSetDevice(device_id));
+    cout << "Using GPU " << device_id << "." << endl;
+    // cudaMalloc & cudaMemcpy
+    int* dev_a;
+    CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(int)));
+    CUDA_CHECK(cudaMemcpy(dev_a, host_a, LEN * sizeof(int), cudaMemcpyHostToDevice));
+    // set_kernel & result copy & print
+    // 2*2 blocks of 2*2*2 threads = 32 threads, one per element when LEN is 32.
+    dim3 grid_dim(2, 2, 1);   // gridDim.x,  gridDim.y,  gridDim.z (always 1)
+    dim3 block_dim(2, 2, 2); // blockDim.x, blockDim.y, blockDim.z
+    set_kernel<int><<<grid_dim, block_dim>>>(LEN, dev_a);
+    // A kernel launch returns no status; fetch launch errors explicitly.
+    CUDA_CHECK(cudaGetLastError());
+    CUDA_CHECK(cudaMemcpy(host_a, dev_a, LEN * sizeof(int), cudaMemcpyDeviceToHost));
+    VECTOR_PRINT("set_kernel results", host_a, LEN);
+    // Free gpu memory & free cpu memory
+    CUDA_CHECK(cudaFree(dev_a));
+    delete[] host_a;
+    return 0;
+}
--- a/cuda/stream_example.cu
+++ b/cuda/stream_example.cu
+#include "cuda_runtime.h"  
+#include "device_launch_parameters.h"  
+#include <stdio.h>  
+cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size);  
+// Adds one element pair per block: block k computes c[k] = a[k] + b[k].
+// Only blockIdx.x is used as the index; threadIdx is ignored.
+__global__ void addKernel(int *c, const int *a, const int *b)
+{
+    const int idx = blockIdx.x;
+    c[idx] = a[idx] + b[idx];
+}
+// Entry point: adds two fixed 5-element vectors with addWithCuda, prints the
+// result and resets the device. Returns 1 on any CUDA failure, 0 otherwise.
+int main()
+{
+    const int arraySize = 5;
+    const int a[arraySize] = { 1, 2, 3, 4, 5 };
+    const int b[arraySize] = { 10, 20, 30, 40, 50 };
+    int c[arraySize] = { 0 };
+    cudaError_t cudaStatus;
+    // Enumerate the devices. The queried properties are not used further.
+    int num = 0;
+    cudaDeviceProp prop;
+    cudaStatus = cudaGetDeviceCount(&num);
+    if (cudaStatus != cudaSuccess)
+    {
+        // Previously this status was overwritten without being looked at.
+        fprintf(stderr, "cudaGetDeviceCount failed!");
+        return 1;
+    }
+    for(int i = 0;i<num;i++)
+    {
+        cudaGetDeviceProperties(&prop,i);
+    }
+    // Add vectors in parallel.
+    cudaStatus = addWithCuda(c, a, b, arraySize);
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "addWithCuda failed!");
+        return 1;
+    }
+    printf("{1,2,3,4,5} + {10,20,30,40,50} = {%d,%d,%d,%d,%d}\n",c[0],c[1],c[2],c[3],c[4]);
+    // cudaDeviceReset must be called before exiting in order for profiling and
+    // tracing tools such as Nsight and Visual Profiler to show complete traces.
+    // It replaces the deprecated cudaThreadExit, which has identical behavior.
+    cudaStatus = cudaDeviceReset();
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "cudaDeviceReset failed!");
+        return 1;
+    }
+    return 0;
+}
+// Helper function for using CUDA to add vectors in parallel.
+// Copies host arrays a and b (each `size` ints) to the device, launches one
+// single-thread addKernel per element spread round-robin over a small pool of
+// streams, and copies the sums back into host array c.
+// Returns cudaSuccess, or the first CUDA error encountered. Device buffers
+// and every stream that was actually created are released on all paths.
+cudaError_t addWithCuda(int *c, const int *a, const int *b, size_t size)
+{
+    const int kNumStreams = 5;
+    int *dev_a = 0;
+    int *dev_b = 0;
+    int *dev_c = 0;
+    // Declared before the first `goto Error` and counted as they are created,
+    // so the cleanup code never destroys a handle that was never initialized.
+    cudaStream_t stream[kNumStreams] = { 0 };
+    int numCreated = 0;
+    cudaError_t cudaStatus;
+    // Choose which GPU to run on, change this on a multi-GPU system.
+    cudaStatus = cudaSetDevice(0);
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
+        goto Error;
+    }
+    // Allocate GPU buffers for three vectors (two input, one output).
+    cudaStatus = cudaMalloc((void**)&dev_c, size * sizeof(int));
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "cudaMalloc failed!");
+        goto Error;
+    }
+    cudaStatus = cudaMalloc((void**)&dev_a, size * sizeof(int));
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "cudaMalloc failed!");
+        goto Error;
+    }
+    cudaStatus = cudaMalloc((void**)&dev_b, size * sizeof(int));
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "cudaMalloc failed!");
+        goto Error;
+    }
+    // Copy input vectors from host memory to GPU buffers.
+    cudaStatus = cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "cudaMemcpy failed!");
+        goto Error;
+    }
+    cudaStatus = cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "cudaMemcpy failed!");
+        goto Error;
+    }
+    // Create the streams.
+    for(int i = 0;i<kNumStreams;i++)
+    {
+        cudaStatus = cudaStreamCreate(&stream[i]);
+        if (cudaStatus != cudaSuccess)
+        {
+            fprintf(stderr, "cudaStreamCreate failed!");
+            goto Error;
+        }
+        numCreated++;
+    }
+    // Launch one single-thread kernel per element, cycling through the
+    // streams. Looping to `size` instead of a fixed 5 keeps every launch
+    // inside the allocated buffers and covers every element.
+    for(size_t i = 0;i<size;i++)
+    {
+        addKernel<<<1,1,0,stream[i % kNumStreams]>>>(dev_c+i, dev_a+i, dev_b+i);
+    }
+    // Launches return no status; fetch launch errors explicitly.
+    cudaStatus = cudaGetLastError();
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
+        goto Error;
+    }
+    // cudaDeviceSynchronize waits for all streams to finish and returns any
+    // error hit while the kernels ran. It replaces the deprecated
+    // cudaThreadSynchronize.
+    cudaStatus = cudaDeviceSynchronize();
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
+        goto Error;
+    }
+    // Copy output vector from GPU buffer to host memory.
+    cudaStatus = cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);
+    if (cudaStatus != cudaSuccess)
+    {
+        fprintf(stderr, "cudaMemcpy failed!");
+        goto Error;
+    }
+Error:
+    // Destroy only the streams that were created.
+    for(int i = 0;i<numCreated;i++)
+    {
+        cudaStreamDestroy(stream[i]);
+    }
+    cudaFree(dev_c);
+    cudaFree(dev_a);
+    cudaFree(dev_b);
+    return cudaStatus;
+}
--- a/cuda/thrust_max.cu
+++ b/cuda/thrust_max.cu
+#include <cuda_runtime.h>
+#include <thrust/sort.h>
+#include <thrust/device_ptr.h>
+#include <iostream>
+#include <algorithm>
+using thrust::sort;
+using thrust::device_ptr;
+using std::cout;
+using std::endl;
+// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
+// prints the error code and message to stdout. Execution continues afterwards.
+// do { } while (0) makes `CUDA_CHECK(...);` a single statement, so it stays
+// correct inside an unbraced if/else.
+#define CUDA_CHECK(x) \
+    do { cudaError_t cuda_error = (x); \
+        if (cuda_error != cudaSuccess) \
+            cout << "cudaError_t: " << cuda_error << " != 0 " \
+                 << cudaGetErrorString(cuda_error) << endl; \
+    } while (0)
+// Prints `head_str: {v0, v1, ..., vlast}` for the first `LEN` elements of
+// `vec`. Arguments are parenthesized, the body is one statement, and an empty
+// vector prints `{}` instead of reading vec[-1].
+#define VECTOR_PRINT(head_str, vec, LEN) \
+    do { \
+        cout << (head_str) << ": {"; \
+        for (int i = 0; i < (LEN) - 1; ++i){ \
+            cout << (vec)[i] << ", "; \
+        } \
+        if ((LEN) > 0) cout << (vec)[(LEN) - 1]; \
+        cout << "}" << endl; \
+    } while (0)
+#define LEN 32
+// Binary functor returning the larger of its two arguments; when x is not
+// strictly greater than y, y is returned. Callable from host and device code.
+template <typename Dtype>
+struct MAX_OP{
+    __host__ __device__
+    Dtype operator() (const Dtype& x, const Dtype& y) const {
+        if (x > y) {
+            return x;
+        }
+        return y;
+    }
+};
+// Host driver: uploads LEN floats (LEN, LEN-1, ..., 1) to the GPU and prints
+// their maximum, computed with thrust::reduce and the MAX_OP functor.
+int main(){
+    // host memory malloc & initial
+    float* host_a = new float[LEN];
+    for (int i = 0; i < LEN; ++i){
+        host_a[i] = LEN - i;
+    }
+    // GPU device start
+    // NOTE(review): device index 1 is hard-coded; confirm it exists on the target host.
+    int device_id = 1;
+    CUDA_CHECK(cudaSetDevice(device_id));
+    cout << "Using GPU " << device_id << "." << endl;
+    // cudaMalloc & cudaMemcpy
+    float* dev_a;
+    CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(float)));
+    CUDA_CHECK(cudaMemcpy(dev_a, host_a,  LEN * sizeof(float), cudaMemcpyHostToDevice));
+    // thrust reduction max
+    device_ptr<float> dp(dev_a);
+    MAX_OP<float> max_op;
+    // Seed the reduction with an actual element. A literal 0 would beat every
+    // element and give a wrong maximum if the data were all negative.
+    cout << "max: " << thrust::reduce(dp, dp + LEN, host_a[0], max_op) << endl;
+    // Free gpu memory & free cpu memory
+    CUDA_CHECK(cudaFree(dev_a));
+    delete[] host_a;
+    return 0;
+}
--- a/cuda/thrust_sort.cu
+++ b/cuda/thrust_sort.cu
+#include <cuda_runtime.h>
+#include <thrust/sort.h>
+#include <thrust/device_ptr.h>
+#include <iostream>
+#include <algorithm>
+using thrust::sort;
+using thrust::device_ptr;
+using std::cout;
+using std::endl;
+// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
+// prints the error code and message to stdout. Execution continues afterwards.
+// do { } while (0) makes `CUDA_CHECK(...);` a single statement, so it stays
+// correct inside an unbraced if/else.
+#define CUDA_CHECK(x) \
+    do { cudaError_t cuda_error = (x); \
+        if (cuda_error != cudaSuccess) \
+            cout << "cudaError_t: " << cuda_error << " != 0 " \
+                 << cudaGetErrorString(cuda_error) << endl; \
+    } while (0)
+// Prints `head_str: {v0, v1, ..., vlast}` for the first `LEN` elements of
+// `vec`. Arguments are parenthesized, the body is one statement, and an empty
+// vector prints `{}` instead of reading vec[-1].
+#define VECTOR_PRINT(head_str, vec, LEN) \
+    do { \
+        cout << (head_str) << ": {"; \
+        for (int i = 0; i < (LEN) - 1; ++i){ \
+            cout << (vec)[i] << ", "; \
+        } \
+        if ((LEN) > 0) cout << (vec)[(LEN) - 1]; \
+        cout << "}" << endl; \
+    } while (0)
+#define LEN 32
+// Host driver: sorts the same descending sequence (LEN, LEN-1, ..., 1) once
+// on the GPU with thrust::sort and once on the host with std::sort, printing
+// both results.
+int main(){
+    // Two identical host buffers: one for the device sort, one for std::sort.
+    float* host_a = new float[LEN];
+    float* host_b = new float[LEN];
+    for (int i = 0; i < LEN; ++i){
+        const float value = LEN - i;
+        host_a[i] = value;
+        host_b[i] = value;
+    }
+    // Select the GPU.
+    const int device_id = 1;
+    CUDA_CHECK(cudaSetDevice(device_id));
+    cout << "Using GPU " << device_id << "." << endl;
+    // Upload the first buffer.
+    const size_t bytes = LEN * sizeof(float);
+    float* dev_a;
+    CUDA_CHECK(cudaMalloc((void**)&dev_a, bytes));
+    CUDA_CHECK(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice));
+    // Device sort through a thrust device pointer, then download and print.
+    device_ptr<float> first(dev_a);
+    thrust::sort(first, first + LEN);
+    CUDA_CHECK(cudaMemcpy(host_a, dev_a, bytes, cudaMemcpyDeviceToHost));
+    VECTOR_PRINT("thrust sort", host_a, LEN);
+    // Host sort of the untouched second buffer.
+    std::sort(host_b, host_b + LEN);
+    VECTOR_PRINT("std sort", host_b, LEN);
+    // Release device and host memory.
+    CUDA_CHECK(cudaFree(dev_a));
+    delete[] host_a;
+    delete[] host_b;
+    return 0;
+}
--- a/cuda/thrust_sum.cu
+++ b/cuda/thrust_sum.cu
+#include <cuda_runtime.h>
+#include <thrust/sort.h>
+#include <thrust/device_ptr.h>
+#include <iostream>
+#include <algorithm>
+using thrust::sort;
+using thrust::device_ptr;
+using std::cout;
+using std::endl;
+// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
+// prints the error code and message to stdout. Execution continues afterwards.
+// do { } while (0) makes `CUDA_CHECK(...);` a single statement, so it stays
+// correct inside an unbraced if/else.
+#define CUDA_CHECK(x) \
+    do { cudaError_t cuda_error = (x); \
+        if (cuda_error != cudaSuccess) \
+            cout << "cudaError_t: " << cuda_error << " != 0 " \
+                 << cudaGetErrorString(cuda_error) << endl; \
+    } while (0)
+// Prints `head_str: {v0, v1, ..., vlast}` for the first `LEN` elements of
+// `vec`. Arguments are parenthesized, the body is one statement, and an empty
+// vector prints `{}` instead of reading vec[-1].
+#define VECTOR_PRINT(head_str, vec, LEN) \
+    do { \
+        cout << (head_str) << ": {"; \
+        for (int i = 0; i < (LEN) - 1; ++i){ \
+            cout << (vec)[i] << ", "; \
+        } \
+        if ((LEN) > 0) cout << (vec)[(LEN) - 1]; \
+        cout << "}" << endl; \
+    } while (0)
+#define LEN 32
+// Host driver: uploads LEN floats (LEN, LEN-1, ..., 1) to the GPU and prints
+// the result of a thrust::reduce with thrust::plus, seeded with 100.
+int main(){
+    // Fill the host buffer with a descending sequence.
+    float* host_a = new float[LEN];
+    for (int i = 0; i < LEN; ++i){
+        host_a[i] = LEN - i;
+    }
+    // Select the GPU.
+    const int device_id = 1;
+    CUDA_CHECK(cudaSetDevice(device_id));
+    cout << "Using GPU " << device_id << "." << endl;
+    // Upload the data.
+    const size_t bytes = LEN * sizeof(float);
+    float* dev_a;
+    CUDA_CHECK(cudaMalloc((void**)&dev_a, bytes));
+    CUDA_CHECK(cudaMemcpy(dev_a, host_a, bytes, cudaMemcpyHostToDevice));
+    // Sum reduction on the device.
+    // NOTE(review): the initial value 100 is included in the printed "sum" — confirm that is intended.
+    device_ptr<float> first(dev_a);
+    thrust::plus<float> add_op;
+    cout << "sum: " << thrust::reduce(first, first + LEN, float(100), add_op) << endl;
+    // Release device and host memory.
+    CUDA_CHECK(cudaFree(dev_a));
+    delete[] host_a;
+    return 0;
+}
--- a/cuda/thrust_var.cu
+++ b/cuda/thrust_var.cu
+#include <cuda_runtime.h>
+#include <thrust/sort.h>
+#include <thrust/transform_reduce.h>
+#include <thrust/device_ptr.h>
+#include <iostream>
+#include <algorithm>
+using thrust::sort;
+using thrust::device_ptr;
+using std::cout;
+using std::endl;
+// Evaluates a CUDA runtime call once and, if it did not return cudaSuccess,
+// prints the error code and message to stdout. Execution continues afterwards.
+// do { } while (0) makes `CUDA_CHECK(...);` a single statement, so it stays
+// correct inside an unbraced if/else.
+#define CUDA_CHECK(x) \
+    do { cudaError_t cuda_error = (x); \
+        if (cuda_error != cudaSuccess) \
+            cout << "cudaError_t: " << cuda_error << " != 0 " \
+                 << cudaGetErrorString(cuda_error) << endl; \
+    } while (0)
+// Prints `head_str: {v0, v1, ..., vlast}` for the first `LEN` elements of
+// `vec`. Arguments are parenthesized, the body is one statement, and an empty
+// vector prints `{}` instead of reading vec[-1].
+#define VECTOR_PRINT(head_str, vec, LEN) \
+    do { \
+        cout << (head_str) << ": {"; \
+        for (int i = 0; i < (LEN) - 1; ++i){ \
+            cout << (vec)[i] << ", "; \
+        } \
+        if ((LEN) > 0) cout << (vec)[(LEN) - 1]; \
+        cout << "}" << endl; \
+    } while (0)
+#define LEN 5
+// mean_square<T> maps x to its squared deviation from a fixed mean:
+// f(x) -> (x - mean) * (x - mean). The mean is captured at construction.
+template <typename T>
+struct mean_square
+{
+  T mean;
+  mean_square(T m) : mean(m) {}
+  __host__ __device__
+  T operator()(const T& x) const {
+    const T diff = x - mean;
+    return diff * diff;
+  }
+};
+// Host driver: uploads LEN floats (LEN, LEN-1, ..., 1) to the GPU and prints
+// the sum of squared deviations from 1.0, computed with
+// thrust::transform_reduce (mean_square as the transform, plus as the reduction).
+int main(){
+    // host memory malloc & initial
+    float* host_a = new float[LEN];
+    // Start at 0: starting at 1 left host_a[0] uninitialized, and that value
+    // was then copied to the device and included in the reduction.
+    for (int i = 0; i < LEN; ++i){
+        host_a[i] = LEN - i;
+    }
+    // GPU device start
+    // NOTE(review): device index 1 is hard-coded; confirm it exists on the target host.
+    int device_id = 1;
+    CUDA_CHECK(cudaSetDevice(device_id));
+    cout << "Using GPU " << device_id << "." << endl;
+    // cudaMalloc & cudaMemcpy
+    float* dev_a;
+    CUDA_CHECK(cudaMalloc((void**)&dev_a, LEN * sizeof(float)));
+    CUDA_CHECK(cudaMemcpy(dev_a, host_a,  LEN * sizeof(float), cudaMemcpyHostToDevice));
+    // thrust transform + sum reduction
+    device_ptr<float> dp(dev_a);
+    // NOTE(review): the reference value 1.0 is fixed, not the mean of the data — confirm intent.
+    mean_square<float> ms_op(1.0f);
+    thrust::plus<float> add_op;
+    cout << "sum: " << thrust::transform_reduce(dp, dp + LEN, ms_op, float(0), add_op) << endl;
+    // Free gpu memory & free cpu memory
+    CUDA_CHECK(cudaFree(dev_a));
+    delete[] host_a;
+    return 0;
+}