diff --git a/src/gpu_alloc.c b/src/gpu_alloc.c index 46b8119..2bf111f 100644 --- a/src/gpu_alloc.c +++ b/src/gpu_alloc.c @@ -35,7 +35,7 @@ vfree(void* target) { void vmemcheck() { if (MAIN_MEM_STACK.totalGPUAllocated != 0) { - printf("\nVRAM MEMORY LEAK: Unallocated %d arrays\n", MAIN_MEM_STACK.totalGPUAllocated); + printf("\nVRAM MEMORY LEAK: leaked %d array(s)\n", MAIN_MEM_STACK.totalGPUAllocated); } } diff --git a/src/ndmath/linalg.c b/src/ndmath/linalg.c index fe85d94..c8bfd69 100644 --- a/src/ndmath/linalg.c +++ b/src/ndmath/linalg.c @@ -55,32 +55,19 @@ NDArray_FMatmul(NDArray *a, NDArray *b) { cublasHandle_t handle; cublasCreate(&handle); - float* d_A; - float* d_B; - float* d_C; - size_t size_A = NDArray_NUMELEMENTS(a) * sizeof(float); - size_t size_B = NDArray_NUMELEMENTS(b) * sizeof(float); - size_t size_C = NDArray_NUMELEMENTS(result) * sizeof(float); - - cudaMalloc((void**)&d_A, size_A); - cudaMalloc((void**)&d_B, size_B); - cudaMalloc((void**)&d_C, size_C); - - cudaMemcpy(d_A, NDArray_FDATA(a), size_A, cudaMemcpyHostToDevice); - cudaMemcpy(d_B, NDArray_FDATA(b), size_B, cudaMemcpyHostToDevice); + float* deviceResult; + size_t sizeResult = NDArray_NUMELEMENTS(result) * sizeof(float); + vmalloc((void**)&deviceResult, sizeResult); int m = NDArray_SHAPE(a)[0]; int n = NDArray_SHAPE(b)[1]; int k = NDArray_SHAPE(a)[1]; float alpha = 1.0f; float beta = 0.0f; - cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, d_B, n, d_A, k, &beta, d_C, n); - cudaMemcpy(NDArray_FDATA(result), d_C, size_C, cudaMemcpyDeviceToHost); - - cudaFree(d_A); - cudaFree(d_B); - cudaFree(d_C); + cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, NDArray_FDATA(b), n, NDArray_FDATA(a), k, &beta, deviceResult, n); + vfree(result->data); + result->data = (void*)deviceResult; cublasDestroy(handle); #endif } else {