Commit 9adab13b by lvzhengyang

initial commit

# ---- CMakeLists.txt ----
cmake_minimum_required(VERSION 3.14)
project(CUDA_MAT_MUL LANGUAGES CXX CUDA)
# set(cuBLAS_DIR .)
# find_package(cuBLAS)
set(CMAKE_CXX_STANDARD 11)
if(MSVC)
  add_compile_options("/std:c++latest")
endif()
add_definitions(-DDEBUG)
# With the CUDA language enabled via project(), the FindCUDA variable CUDA_NVCC_FLAGS is
# ignored; device debug flags must be passed through CMAKE_CUDA_FLAGS instead.
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -G -g")
include_directories(include)
# file(GLOB_RECURSE eden_sourcefile "src/*.cpp")
aux_source_directory(./src/ sourcefile)
add_library(matmul_kernels ${sourcefile})
add_executable(cudaMatMul main.cu)
# target_link_libraries(cudaMatMul PRIVATE ${CUBLAS_LIBRARIES})
target_link_libraries(cudaMatMul matmul_kernels)
<!-- README.md -->
# CUDA Demo
* This project provides a basic project structure for a CUDA project.
* To build it, run `mkdir build && cd build && cmake .. && make`, then launch the resulting binary with `./cudaMatMul` from the build directory.
### Created by
Lv Zhengyang (吕政阳)
# ---- FindcuBLAS.cmake ----
# ==================================================================================================
# This file is part of the CLBlast project. The project is licensed under Apache Version 2.0. This
# project loosely follows the Google C++ styleguide and uses a tab-size of two spaces and a max-
# width of 100 characters per line.
#
# Author(s):
# Cedric Nugteren <www.cedricnugteren.nl>
#
# ==================================================================================================
#
# Defines the following variables:
#   CUBLAS_FOUND          Boolean holding whether or not the cuBLAS library was found
#   CUBLAS_INCLUDE_DIRS   The CUDA and cuBLAS include directory
#   CUDA_LIBRARIES        The CUDA library
#   CUBLAS_LIBRARIES      The cuBLAS library
#
# In case CUDA is not installed in the default directory, set the CUDA_ROOT variable to point to
# the root of the CUDA installation, such that 'cublas_v2.h' can be found in $CUDA_ROOT/include.
# This can either be done using an environment variable (e.g. export CUDA_ROOT=/path/to/cuda) or
# using a CMake variable (e.g. cmake -DCUDA_ROOT=/path/to/cuda ..).
#
# ==================================================================================================
# Sets the possible install locations
set(CUBLAS_HINTS
  ${CUDA_ROOT}
  $ENV{CUDA_ROOT}
  $ENV{CUDA_TOOLKIT_ROOT_DIR}
)
set(CUBLAS_PATHS
  /usr
  /usr/local
  /usr/local/cuda
)
# Finds the include directories
find_path(CUBLAS_INCLUDE_DIRS
  NAMES cublas_v2.h cuda.h
  HINTS ${CUBLAS_HINTS}
  PATH_SUFFIXES include inc include/x86_64 include/x64
  PATHS ${CUBLAS_PATHS}
  DOC "cuBLAS include header cublas_v2.h"
)
mark_as_advanced(CUBLAS_INCLUDE_DIRS)
# Finds the libraries
find_library(CUDA_LIBRARIES
  NAMES cudart
  HINTS ${CUBLAS_HINTS}
  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
  PATHS ${CUBLAS_PATHS}
  DOC "CUDA library"
)
mark_as_advanced(CUDA_LIBRARIES)
find_library(CUBLAS_LIBRARIES
  NAMES cublas
  HINTS ${CUBLAS_HINTS}
  PATH_SUFFIXES lib lib64 lib/x86_64 lib/x64 lib/x86 lib/Win32 lib/import lib64/import
  PATHS ${CUBLAS_PATHS}
  DOC "cuBLAS library"
)
mark_as_advanced(CUBLAS_LIBRARIES)
# ==================================================================================================
# Notification messages
if(NOT CUBLAS_INCLUDE_DIRS)
  message(STATUS "Could NOT find 'cublas_v2.h', install CUDA/cuBLAS or set CUDA_ROOT")
endif()
if(NOT CUDA_LIBRARIES)
  message(STATUS "Could NOT find CUDA library, install it or set CUDA_ROOT")
endif()
if(NOT CUBLAS_LIBRARIES)
  message(STATUS "Could NOT find cuBLAS library, install it or set CUDA_ROOT")
endif()
# Determines whether or not cuBLAS was found
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(cuBLAS DEFAULT_MSG CUBLAS_INCLUDE_DIRS CUDA_LIBRARIES CUBLAS_LIBRARIES)
# ==================================================================================================
#include "cuda_runtime.h"
#include <iostream>
__global__ void test(void);
__global__ void add(float *x, float *y, float *z, int n);
__global__ void MatMul(
float* matPtr_a,
float* matPtr_b,
float* matPtr_c,
int m,
int k,
int n
);
#include "test.h"
#include "device_launch_parameters.h"
#include "cuda_runtime.h"
// Prints an m-by-n row-major matrix given a pointer to its first element.
template <typename T>
void printMat(const T &mat,
              const int &m,
              const int &n) {
    std::cout << "\n";
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++) {
            std::cout << *(mat + i * n + j) << " ";
        }
        std::cout << "\n";
    }
}
int main(void) {
    int dev = 0;
    cudaDeviceProp devProp;
    // CHECK(cudaGetDeviceProperties(&devProp, dev));  // see the CHECK sketch above
    cudaGetDeviceProperties(&devProp, dev);
    std::cout << "---------------------------------------------------------" << std::endl;
    std::cout << "GPU device: | " << devProp.name << std::endl;
    std::cout << "SM num: | " << devProp.multiProcessorCount << std::endl;
    std::cout << "Shared Memory Per Block: | " << devProp.sharedMemPerBlock / 1024 << " KB" << std::endl;
    std::cout << "Max Threads Per Block: | " << devProp.maxThreadsPerBlock << std::endl;
    std::cout << "Max Threads Per MultiProcessor: | " << devProp.maxThreadsPerMultiProcessor << std::endl;
    std::cout << "Max Warps Per MultiProcessor: | " << devProp.maxThreadsPerMultiProcessor / 32 << std::endl;
    std::cout << "---------------------------------------------------------" << std::endl;
    int m = 4;
    int k = 4;
    int n = 4;
    size_t nBytes_a = m * k * sizeof(float);
    size_t nBytes_b = k * n * sizeof(float);
    size_t nBytes_c = m * n * sizeof(float);
    // Plain host allocations; a Unified Memory variant is kept commented out below.
    float *mat_a, *mat_b, *mat_c;
    // cudaMallocManaged((void **)&mat_a, nBytes_a);
    // cudaMallocManaged((void **)&mat_b, nBytes_b);
    // cudaMallocManaged((void **)&mat_c, nBytes_c);
    mat_a = (float *)malloc(nBytes_a);
    mat_b = (float *)malloc(nBytes_b);
    mat_c = (float *)malloc(nBytes_c);
    for (int i = 0; i < m * k; i++) {
        mat_a[i] = i;
    }
    printMat(mat_a, m, k);
    for (int i = 0; i < k * n; i++) {
        mat_b[i] = i;
    }
    printMat(mat_b, k, n);
    for (int i = 0; i < m * n; i++) {
        mat_c[i] = 0.0f;
    }
    float *d_a, *d_b, *d_c;
    cudaMalloc((void **)&d_a, nBytes_a);
    cudaMalloc((void **)&d_b, nBytes_b);
    cudaMalloc((void **)&d_c, nBytes_c);
    cudaMemcpy((void *)d_a, (void *)mat_a, nBytes_a, cudaMemcpyHostToDevice);
    cudaMemcpy((void *)d_b, (void *)mat_b, nBytes_b, cudaMemcpyHostToDevice);
    cudaMemcpy((void *)d_c, (void *)mat_c, nBytes_c, cudaMemcpyHostToDevice);
    // dim3 blockSize(k);  // earlier per-thread variant, see the note in the kernel source
    // One thread per block, one block per element of C:
    // blockIdx.x indexes the row, blockIdx.y the column.
    dim3 blockSize(1);
    dim3 gridSize(m, n);
    // add<<<gridSize, blockSize>>>(x, y, z, N);
    MatMul<<<gridSize, blockSize>>>(d_a, d_b, d_c, m, k, n);
    cudaError_t cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaError: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    // cudaMemcpy synchronizes implicitly, but an explicit sync first keeps the intent clear.
    cudaDeviceSynchronize();
    cudaMemcpy((void *)mat_c, (void *)d_c, nBytes_c, cudaMemcpyDeviceToHost);
    printMat(mat_c, m, n);
    // Verify against a CPU reference; the original check against a constant 100.0
    // was left over from a vector-add example and does not apply to this matmul.
    float maxError = 0.0f;
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < n; col++) {
            float ref = 0.0f;
            for (int i = 0; i < k; i++) {
                ref += mat_a[row * k + i] * mat_b[i * n + col];
            }
            maxError = fmax(maxError, fabs(mat_c[row * n + col] - ref));
        }
    }
    std::cout << "Max Error: " << maxError << std::endl;
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(mat_a);
    free(mat_b);
    free(mat_c);
    test<<<1, 1>>>();
    cudaDeviceSynchronize();  // flush the kernel's printf output before exiting
    std::cout << "Hello CUDA!" << std::endl;
    return 0;
}
#include "test.h"
// #include <iostream>
#include <stdio.h>
__global__ void test(void) {
// std::cout << "Hello CUDA!" << std::endl;
printf("In the cuda function!\n");
}
// Grid-stride loop: z = x + y element-wise, correct for any n and launch size.
__global__ void add(float *x, float *y, float *z, int n) {
    int index = threadIdx.x + blockIdx.x * blockDim.x;
    int stride = blockDim.x * gridDim.x;
    for (int i = index; i < n; i += stride) {
        z[i] = x[i] + y[i];
    }
}
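// For reference, a host-side sketch of launching the grid-stride add kernel
// over n elements (hypothetical names d_x, d_y, d_z; not part of the original
// commit, which only keeps a commented-out add launch in main.cu):
//   int n = 1 << 20;
//   int block = 256;
//   int grid = (n + block - 1) / block;  // enough blocks to cover n once
//   add<<<grid, block>>>(d_x, d_y, d_z, n);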
// c = a * b, row-major: a[i][j] is stored at a[i * rowWidth + j]
// Shapes: (m x k) * (k x n) = (m x n)
__global__ void MatMul(
    float* matPtr_a,
    float* matPtr_b,
    float* matPtr_c,
    int m,
    int k,
    int n
) {
    // One block per element of C: blockIdx.x is the row, blockIdx.y the column.
    int c_row = blockIdx.x;
    int c_col = blockIdx.y;
    /* Earlier debugging variant, launched with blockSize(k) so that each of the
     * k threads in a block added one partial product; the += on global memory
     * races between threads unless it is made atomic:
     *   *(matPtr_c + c_row * n + c_col) +=
     *       *(matPtr_a + c_row * k + threadIdx.x) * *(matPtr_b + threadIdx.x * n + c_col);
     */
    // Accumulate the dot product in a register and write the result once.
    float acc = 0.0f;
    for (int i = 0; i < k; i++) {
        acc += matPtr_a[c_row * k + i] * matPtr_b[i * n + c_col];
    }
    matPtr_c[c_row * n + c_col] = acc;
}
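// A possible next step, sketched here as an illustration (not part of the
// original commit): the same product computed with shared-memory tiles, one
// TILE x TILE thread block per output tile. Assumes m, k, and n are multiples
// of TILE; launch as MatMulTiled<<<dim3(n / TILE, m / TILE), dim3(TILE, TILE)>>>(...).
#define TILE 16
__global__ void MatMulTiled(const float* a, const float* b, float* c,
                            int m, int k, int n) {
    __shared__ float tileA[TILE][TILE];
    __shared__ float tileB[TILE][TILE];
    int row = blockIdx.y * TILE + threadIdx.y;
    int col = blockIdx.x * TILE + threadIdx.x;
    float acc = 0.0f;
    for (int t = 0; t < k / TILE; t++) {
        // Each thread stages one element of each input tile into shared memory.
        tileA[threadIdx.y][threadIdx.x] = a[row * k + t * TILE + threadIdx.x];
        tileB[threadIdx.y][threadIdx.x] = b[(t * TILE + threadIdx.y) * n + col];
        __syncthreads();  // tiles must be fully loaded before use
        for (int i = 0; i < TILE; i++) {
            acc += tileA[threadIdx.y][i] * tileB[i][threadIdx.x];
        }
        __syncthreads();  // finish reading before the next iteration overwrites the tiles
    }
    c[row * n + col] = acc;
}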