Commit 75c7d05f by lvzhengyang

enlarge the matrix size

parent b693f659
#include "test.h" #include "test.h"
#include "device_launch_parameters.h" #include "device_launch_parameters.h"
#include "cuda_runtime.h" #include "cuda_runtime.h"
#include <ctime>
#include "chrono"
using namespace std::chrono;
template <typename T> template <typename T>
void printMat(const T &mat, void printMat(const T &mat,
@@ -29,9 +33,9 @@ int main(void) {
std::cout << "Max Threads Batch Per MultiProcessor: | " << devProp.maxThreadsPerMultiProcessor / 32 << std::endl; std::cout << "Max Threads Batch Per MultiProcessor: | " << devProp.maxThreadsPerMultiProcessor / 32 << std::endl;
std::cout << "---------------------------------------------------------" << std::endl; std::cout << "---------------------------------------------------------" << std::endl;
int m = 4; int m = 1000;
int k = 4; int k = 600;
int n = 4; int n = 1000;
int nBytes_a = m * k * sizeof(float); int nBytes_a = m * k * sizeof(float);
int nBytes_b = k * n * sizeof(float); int nBytes_b = k * n * sizeof(float);
int nBytes_c = m * n * sizeof(float); int nBytes_c = m * n * sizeof(float);
@@ -46,14 +50,14 @@ int main(void) {
mat_c = (float *)malloc(nBytes_c); mat_c = (float *)malloc(nBytes_c);
for (int i = 0; i < m * k; i++) { for (int i = 0; i < m * k; i++) {
mat_a[i] = i; mat_a[i] = 10;
} }
printMat(mat_a, m, k); // printMat(mat_a, m, k);
for (int i = 0; i < k * n; i++) { for (int i = 0; i < k * n; i++) {
mat_b[i] = i; mat_b[i] = 10;
} }
printMat(mat_b, k, n); // printMat(mat_b, k, n);
for (int i = 0; i < m * n; i++) { for (int i = 0; i < m * n; i++) {
mat_c[i] = 0.0; mat_c[i] = 0.0;
@@ -64,6 +68,8 @@ int main(void) {
cudaMalloc((void **)&d_b, nBytes_b); cudaMalloc((void **)&d_b, nBytes_b);
cudaMalloc((void **)&d_c, nBytes_c); cudaMalloc((void **)&d_c, nBytes_c);
auto start = system_clock::now();
cudaMemcpy((void *)d_a, (void *)mat_a, nBytes_a, cudaMemcpyHostToDevice); cudaMemcpy((void *)d_a, (void *)mat_a, nBytes_a, cudaMemcpyHostToDevice);
cudaMemcpy((void *)d_b, (void *)mat_b, nBytes_b, cudaMemcpyHostToDevice); cudaMemcpy((void *)d_b, (void *)mat_b, nBytes_b, cudaMemcpyHostToDevice);
cudaMemcpy((void *)d_c, (void *)mat_c, nBytes_c, cudaMemcpyHostToDevice); cudaMemcpy((void *)d_c, (void *)mat_c, nBytes_c, cudaMemcpyHostToDevice);
@@ -78,15 +84,24 @@ int main(void) {
if (cudaStatus != cudaSuccess) { if (cudaStatus != cudaSuccess) {
std::cout << "cudaError: " << cudaGetErrorString(cudaStatus) << std::endl; std::cout << "cudaError: " << cudaGetErrorString(cudaStatus) << std::endl;
} }
cudaMemcpy((void *)mat_c, (void *)d_c, nBytes_c, cudaMemcpyDeviceToHost); cudaMemcpy((void *)mat_c, (void *)d_c, nBytes_c, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize(); cudaDeviceSynchronize();
printMat(mat_c, m, n);
auto end = system_clock::now();
auto duration = duration_cast<microseconds>(end - start);
std::cout << "GPU time: "
<< double(duration.count()) * microseconds::period::num / microseconds::period::den << " seconds" << std::endl;
// printMat(mat_c, m, n);
/*
float maxError = 0.0; float maxError = 0.0;
for (int i = 0; i < m * n; i++) { for (int i = 0; i < m * n; i++) {
maxError = fmax(maxError, fabs(*(mat_c + i) - 100.0)); maxError = fmax(maxError, fabs(*(mat_c + i) - 100.0));
} }
std::cout << "Max Error: " << maxError << std::endl; std::cout << "Max Error: " << maxError << std::endl;
*/
cudaFree(d_a); cudaFree(d_a);
cudaFree(d_b); cudaFree(d_b);
@@ -96,7 +111,7 @@ int main(void) {
free(mat_b); free(mat_b);
free(mat_c); free(mat_c);
test<<<1, 1>>>(); // test<<<1, 1>>>();
std::cout << "Hello CUDA!" << std::endl; // std::cout << "Hello CUDA!" << std::endl;
return 0; return 0;
} }
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment