Commit 9adab13b by lvzhengyang

initial commit

# ---- CMakeLists.txt ----
cmake_minimum_required(VERSION 3.14)
project(CUDA_MAT_MUL LANGUAGES CXX CUDA)
# set(cuBLAS_DIR .)
# find_package(cuBLAS)
set(CMAKE_CXX_STANDARD 11)
if(MSVC)
  add_compile_options("/std:c++latest")
endif()
add_definitions(-DDEBUG)
# With the CUDA language enabled via project(), the FindCUDA variable CUDA_NVCC_FLAGS is
# ignored; device debug flags must be passed through CMAKE_CUDA_FLAGS instead.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G -g")
include_directories(include)
# file(GLOB_RECURSE eden_sourcefile "src/*.cpp")
aux_source_directory(./src/ sourcefile)
add_library(matmul_kernels ${sourcefile})
add_executable(cudaMatMul main.cu)
# target_link_libraries(cudaMatMul PRIVATE ${CUBLAS_LIBRARIES})
target_link_libraries(cudaMatMul matmul_kernels)
<!-- README.md -->
# CUDA Demo
* This project provides a basic project structure for a CUDA project.
* To build it, run `mkdir build && cd build && cmake .. && make`, then launch the resulting binary with `./cudaMatMul` from the build directory.
### Created by
Lv Zhengyang (吕政阳)
# ---- FindcuBLAS.cmake ----
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
# width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# ==================================================================================================
#
# Defines the following variables:
#   CUBLAS_FOUND          Boolean holding whether or not the cuBLAS library was found
#   CUBLAS_INCLUDE_DIRS   The CUDA and cuBLAS include directory
#   CUDA_LIBRARIES        The CUDA library
#   CUBLAS_LIBRARIES      The cuBLAS library
#
# In case CUDA is not installed in the default directory, set the CUDA_ROOT variable to point to
# the root of the CUDA installation, such that 'cublas_v2.h' can be found in $CUDA_ROOT/include.
# This can either be done using an environment variable (e.g. export CUDA_ROOT=/path/to/cuda) or
# using a CMake variable (e.g. cmake -DCUDA_ROOT=/path/to/cuda ..).
#
# ==================================================================================================
# Sets the possible install locations
set(CUBLAS_HINTS
  ${CUDA_ROOT}
  $ENV{CUDA_ROOT}
  $ENV{CUDA_TOOLKIT_ROOT_DIR}
)
set(CUBLAS_PATHS
  /usr
  /usr/local
  /usr/local/cuda
)
# Finds the include directories
find_path(CUBLAS_INCLUDE_DIRS
  NAMES cublas_v2.h cuda.h
  HINTS ${CUBLAS_HINTS}
  PATH_SUFFIXES include inc include/x86_64 include/x64
  PATHS ${CUBLAS_PATHS}
  DOC "cuBLAS include header cublas_v2.h"
)
mark_as_advanced(CUBLAS_INCLUDE_DIRS)
# Finds the libraries
find_library(CUDA_LIBRARIES
  NAMES cudart
  HINTS ${CUBLAS_HINTS}
  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
  PATHS ${CUBLAS_PATHS}
  DOC "CUDA library"
)
mark_as_advanced(CUDA_LIBRARIES)
find_library(CUBLAS_LIBRARIES
  NAMES cublas
  HINTS ${CUBLAS_HINTS}
  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
  PATHS ${CUBLAS_PATHS}
  DOC "cuBLAS library"
)
mark_as_advanced(CUBLAS_LIBRARIES)
# ==================================================================================================
# Notification messages
if(NOT CUBLAS_INCLUDE_DIRS)
  message(STATUS "Could NOT find 'cublas_v2.h', install CUDA/cuBLAS or set CUDA_ROOT")
endif()
if(NOT CUDA_LIBRARIES)
  message(STATUS "Could NOT find CUDA library, install it or set CUDA_ROOT")
endif()
if(NOT CUBLAS_LIBRARIES)
  message(STATUS "Could NOT find cuBLAS library, install it or set CUDA_ROOT")
endif()
# Determines whether or not cuBLAS was found
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(cuBLAS DEFAULT_MSG CUBLAS_INCLUDE_DIRS CUDA_LIBRARIES CUBLAS_LIBRARIES)
# ==================================================================================================
#include "cuda_runtime.h"
#include <iostream>
__global__ void test(void);
__global__ void add(float *x, float *y, float *z, int n);
__global__ void MatMul(
float* matPtr_a,
float* matPtr_b,
float* matPtr_c,
int m,
int k,
int n
);
#include "test.h"
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
// Prints an m-by-n row-major matrix given a pointer to its first element.
template <typename T>
void printMat(const T &mat,
              const int &m,
              const int &n) {
    std::cout << "\n";
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            std::cout << *(mat + i * n + j) << " ";
        }
        std::cout << "\n";
    }
}
int main(void) {
    int dev = 0;
    cudaDeviceProp devProp;
    // CHECK(cudaGetDeviceProperties(&devProp, dev));  // see the CHECK sketch above
    cudaGetDeviceProperties(&devProp, dev);
    std::cout << "---------------------------------------------------------" << std::endl;
    std::cout << "GPU device: | " << devProp.name << std::endl;
    std::cout << "SM num: | " << devProp.multiProcessorCount << std::endl;
    std::cout << "Shared Memory Per Block: | " << devProp.sharedMemPerBlock / 1024 << " KB" << std::endl;
    std::cout << "Max Threads Per Block: | " << devProp.maxThreadsPerBlock << std::endl;
    std::cout << "Max Threads Per MultiProcessor: | " << devProp.maxThreadsPerMultiProcessor << std::endl;
    std::cout << "Max Warps Per MultiProcessor: | " << devProp.maxThreadsPerMultiProcessor / 32 << std::endl;
    std::cout << "---------------------------------------------------------" << std::endl;
    int m = 4;
    int k = 4;
    int n = 4;
    size_t nBytes_a = m * k * sizeof(float);
    size_t nBytes_b = k * n * sizeof(float);
    size_t nBytes_c = m * n * sizeof(float);
    // Plain host allocations; a Unified Memory variant is kept commented out below.
    float *mat_a, *mat_b, *mat_c;
    // cudaMallocManaged((void **)&mat_a, nBytes_a);
    // cudaMallocManaged((void **)&mat_b, nBytes_b);
    // cudaMallocManaged((void **)&mat_c, nBytes_c);
    mat_a = (float *)malloc(nBytes_a);
    mat_b = (float *)malloc(nBytes_b);
    mat_c = (float *)malloc(nBytes_c);
    for (int i = 0; i < m * k; i++) {
        mat_a[i] = i;
    }
    printMat(mat_a, m, k);
    for (int i = 0; i < k * n; i++) {
        mat_b[i] = i;
    }
    printMat(mat_b, k, n);
    for (int i = 0; i < m * n; i++) {
        mat_c[i] = 0.0f;
    }
    float *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, nBytes_a);
    cudaMalloc((void **)&d_b, nBytes_b);
    cudaMalloc((void **)&d_c, nBytes_c);
    cudaMemcpy((void *)d_a, (void *)mat_a, nBytes_a, cudaMemcpyHostToDevice);
    cudaMemcpy((void *)d_b, (void *)mat_b, nBytes_b, cudaMemcpyHostToDevice);
    cudaMemcpy((void *)d_c, (void *)mat_c, nBytes_c, cudaMemcpyHostToDevice);
    // dim3 blockSize(k);  // earlier per-thread variant, see the note in the kernel source
    // One thread per block, one block per element of C:
    // blockIdx.x indexes the row, blockIdx.y the column.
    dim3 blockSize(1);
    dim3 gridSize(m, n);
    // add<<<gridSize, blockSize>>>(x, y, z, N);
    MatMul<<<gridSize, blockSize>>>(d_a, d_b, d_c, m, k, n);
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaError: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    // cudaMemcpy synchronizes implicitly, but an explicit sync first keeps the intent clear.
    cudaDeviceSynchronize();
    cudaMemcpy((void *)mat_c, (void *)d_c, nBytes_c, cudaMemcpyDeviceToHost);
    printMat(mat_c, m, n);
    // Verify against a CPU reference; the original check against a constant 100.0
    // was left over from a vector-add example and does not apply to this matmul.
    float maxError = 0.0f;
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            float ref = 0.0f;
            for (int i = 0; i < k; i++) {
                ref += mat_a[row * k + i] * mat_b[i * n + col];
            }
            maxError = fmax(maxError, fabs(mat_c[row * n + col] - ref));
        }
    }
    std::cout << "Max Error: " << maxError << std::endl;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(mat_a);
    free(mat_b);
    free(mat_c);
    test<<<1, 1>>>();
    cudaDeviceSynchronize();  // flush the kernel's printf output before exiting
    std::cout << "Hello CUDA!" << std::endl;
    return 0;
}
#include "test.h"
// #include <iostream>
#include <stdio.h>
__global__ void test(void) {
// std::cout << "Hello CUDA!" << std::endl;
printf("In the cuda function!\n");
}
// Grid-stride loop: z = x + y element-wise, correct for any n and launch size.
__global__ void add(float *x, float *y, float *z, int n) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        z[i] = x[i] + y[i];
    }
}
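// For reference, a host-side sketch of launching the grid-stride add kernel
// over n elements (hypothetical names d_x, d_y, d_z; not part of the original
// commit, which only keeps a commented-out add launch in main.cu):
//   int n = 1 << 20;
//   int block = 256;
//   int grid = (n + block - 1) / block;  // enough blocks to cover n once
//   add<<<grid, block>>>(d_x, d_y, d_z, n);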
// c = a * b, row-major: a[i][j] is stored at a[i * rowWidth + j]
// Shapes: (m x k) * (k x n) = (m x n)
__global__ void MatMul(
    float* matPtr_a,
    float* matPtr_b,
    float* matPtr_c,
    int m,
    int k,
    int n
) {
    // One block per element of C: blockIdx.x is the row, blockIdx.y the column.
    int c_row = blockIdx.x;
    int c_col = blockIdx.y;
    /* Earlier debugging variant, launched with blockSize(k) so that each of the
     * k threads in a block added one partial product; the += on global memory
     * races between threads unless it is made atomic:
     *   *(matPtr_c + c_row * n + c_col) +=
     *       *(matPtr_a + c_row * k + threadIdx.x) * *(matPtr_b + threadIdx.x * n + c_col);
     */
    // Accumulate the dot product in a register and write the result once.
    float acc = 0.0f;
    for (int i = 0; i < k; i++) {
        acc += matPtr_a[c_row * k + i] * matPtr_b[i * n + c_col];
    }
    matPtr_c[c_row * n + c_col] = acc;
}
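// A possible next step, sketched here as an illustration (not part of the
// original commit): the same product computed with shared-memory tiles, one
// TILE x TILE thread block per output tile. Assumes m, k, and n are multiples
// of TILE; launch as MatMulTiled<<<dim3(n / TILE, m / TILE), dim3(TILE, TILE)>>>(...).
#define TILE 16
__global__ void MatMulTiled(const float* a, const float* b, float* c,
                            int m, int k, int n) {
    __shared__ float tileA[TILE][TILE];
    __shared__ float tileB[TILE][TILE];
    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float acc = 0.0f;
    for (int t = 0; t < k / TILE; t++) {
        // Each thread stages one element of each input tile into shared memory.
        tileA[threadIdx.y][threadIdx.x] = a[row * k + t * TILE + threadIdx.x];
        tileB[threadIdx.y][threadIdx.x] = b[(t * TILE + threadIdx.y) * n + col];
        __syncthreads();  // tiles must be fully loaded before use
        for (int i = 0; i < TILE; i++) {
            acc += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];
        }
        __syncthreads();  // finish reading before the next iteration overwrites the tiles
    }
    c[row * n + col] = acc;
}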