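Enlarge the matrix size (m, k, n: 4, 4, 4 -> 1000, 600, 1000), add std::chrono timing around the copies and kernel, and comment out the debug printing and max-error check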

75c7d05f · lvzhengyang · b693f659 · 75c7d05f
Commit 75c7d05f authored Nov 13, 2021 by lvzhengyang
Show whitespace changes
Inline Side-by-side

Showing with 25 additions and 10 deletions

main.cu
+25 -10

No files found.
--- a/main.cu
+++ b/main.cu
 #include "test.h"
 #include "device_launch_parameters.h"
 #include "cuda_runtime.h"
+#include <ctime>
+#include "chrono"
+using namespace std::chrono;
 template <typename T>
 void printMat(const T &mat, 
@@ -29,9 +33,9 @@ int main(void) {
    std::cout << "Max Threads Batch Per MultiProcessor: | " << devProp.maxThreadsPerMultiProcessor / 32 << std::endl;
    std::cout << "---------------------------------------------------------" << std::endl;
-    int m = 4;
+    int m = 1000;
-    int k = 4;
+    int k = 600;
-    int n = 4;
+    int n = 1000;
    int nBytes_a = m * k * sizeof(float);
    int nBytes_b = k * n * sizeof(float);
    int nBytes_c = m * n * sizeof(float);
@@ -46,14 +50,14 @@ int main(void) {
    mat_c = (float *)malloc(nBytes_c);
    for (int i = 0; i < m * k; i++) {
-        mat_a[i] = i;
+        mat_a[i] = 10;
    }
-    printMat(mat_a, m, k);
+    // printMat(mat_a, m, k);
    for (int i = 0; i < k * n; i++) {
-        mat_b[i] = i;
+        mat_b[i] = 10;
    }
-    printMat(mat_b, k, n);
+    // printMat(mat_b, k, n);
    for (int i = 0; i < m * n; i++) {
        mat_c[i] = 0.0;
@@ -64,6 +68,8 @@ int main(void) {
    cudaMalloc((void **)&d_b, nBytes_b);
    cudaMalloc((void **)&d_c, nBytes_c);
+    auto start = system_clock::now();
    cudaMemcpy((void *)d_a, (void *)mat_a, nBytes_a, cudaMemcpyHostToDevice);
    cudaMemcpy((void *)d_b, (void *)mat_b, nBytes_b, cudaMemcpyHostToDevice);
    cudaMemcpy((void *)d_c, (void *)mat_c, nBytes_c, cudaMemcpyHostToDevice);
@@ -78,15 +84,24 @@ int main(void) {
    if (cudaStatus != cudaSuccess) {
        std::cout << "cudaError: " << cudaGetErrorString(cudaStatus) << std::endl;
    }
    cudaMemcpy((void *)mat_c, (void *)d_c, nBytes_c, cudaMemcpyDeviceToHost);
    cudaDeviceSynchronize();
-    printMat(mat_c, m, n);
+    auto end = system_clock::now();
+    auto duration = duration_cast<microseconds>(end - start);
+    std::cout << "GPU time: "
+         << double(duration.count()) * microseconds::period::num / microseconds::period::den << " seconds" << std::endl;
+    // printMat(mat_c, m, n);
+    /* 
    float maxError = 0.0;
    for (int i = 0; i < m * n; i++) {
        maxError = fmax(maxError, fabs(*(mat_c + i) - 100.0));
    }
    std::cout << "Max Error: " << maxError << std::endl;
+    */
    cudaFree(d_a);
    cudaFree(d_b);
@@ -96,7 +111,7 @@ int main(void) {
    free(mat_b);
    free(mat_c);
-    test<<<1, 1>>>();
+    // test<<<1, 1>>>();
-    std::cout << "Hello CUDA!" << std::endl;
+    // std::cout << "Hello CUDA!" << std::endl;
    return 0;
 }