Skip to content

Commit

Permalink
GPU matmul refactoring and optimization (#53)
Browse files Browse the repository at this point in the history
* FMatmul refactoring

* HAVE_CUBLAS is now returned

* fix: Uses gpu_alloc.h methods to allocate VRAM.

fix: Instead of copying the data, it frees the result buffer and overwrites the pointer with the deviceResult address.

fix: gpu_alloc leak typo.

---------

Co-authored-by: Henrique Borba <[email protected]>
  • Loading branch information
SkibidiProduction and henrique-borba authored Jul 8, 2024
1 parent cb8e7ec commit 11dbea0
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 20 deletions.
2 changes: 1 addition & 1 deletion src/gpu_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ vfree(void* target) {
void
vmemcheck() {
if (MAIN_MEM_STACK.totalGPUAllocated != 0) {
printf("\nVRAM MEMORY LEAK: Unallocated %d arrays\n", MAIN_MEM_STACK.totalGPUAllocated);
printf("\nVRAM MEMORY LEAK: leaked %d array(s)\n", MAIN_MEM_STACK.totalGPUAllocated);
}
}

Expand Down
25 changes: 6 additions & 19 deletions src/ndmath/linalg.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,32 +55,19 @@ NDArray_FMatmul(NDArray *a, NDArray *b) {
cublasHandle_t handle;
cublasCreate(&handle);

float* d_A;
float* d_B;
float* d_C;
size_t size_A = NDArray_NUMELEMENTS(a) * sizeof(float);
size_t size_B = NDArray_NUMELEMENTS(b) * sizeof(float);
size_t size_C = NDArray_NUMELEMENTS(result) * sizeof(float);

cudaMalloc((void**)&d_A, size_A);
cudaMalloc((void**)&d_B, size_B);
cudaMalloc((void**)&d_C, size_C);

cudaMemcpy(d_A, NDArray_FDATA(a), size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, NDArray_FDATA(b), size_B, cudaMemcpyHostToDevice);
float* deviceResult;
size_t sizeResult = NDArray_NUMELEMENTS(result) * sizeof(float);

vmalloc((void**)&deviceResult, sizeResult);
int m = NDArray_SHAPE(a)[0];
int n = NDArray_SHAPE(b)[1];
int k = NDArray_SHAPE(a)[1];
float alpha = 1.0f;
float beta = 0.0f;

cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, d_B, n, d_A, k, &beta, d_C, n);
cudaMemcpy(NDArray_FDATA(result), d_C, size_C, cudaMemcpyDeviceToHost);

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, NDArray_FDATA(b), n, NDArray_FDATA(a), k, &beta, deviceResult, n);
vfree(result->data);
result->data = (void*)deviceResult;
cublasDestroy(handle);
#endif
} else {
Expand Down

0 comments on commit 11dbea0

Please sign in to comment.