Skip to content

Commit

Permalink
GPU matmul refactoring and optimization (#53)
Browse files Browse the repository at this point in the history
* FMatmul refactoring

* HAVE_CUBLAS is now returned

* fix: Uses gpu_alloc.h methods to allocate VRAM.

fix: Instead of copying the data, it frees the result buffer and overwrites the pointer with the deviceResult address.

fix: gpu_alloc leak typo.

---------

Co-authored-by: Henrique Borba <[email protected]>
  • Loading branch information
SkibidiProduction and henrique-borba authored Jul 8, 2024
1 parent cb8e7ec commit 11dbea0
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 20 deletions.
2 changes: 1 addition & 1 deletion src/gpu_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ vfree(void* target) {
void
vmemcheck() {
if (MAIN_MEM_STACK.totalGPUAllocated != 0) {
printf("\nVRAM MEMORY LEAK: Unallocated %d arrays\n", MAIN_MEM_STACK.totalGPUAllocated);
printf("\nVRAM MEMORY LEAK: leaked %d array(s)\n", MAIN_MEM_STACK.totalGPUAllocated);
}
}

Expand Down
25 changes: 6 additions & 19 deletions src/ndmath/linalg.c
Original file line number Diff line number Diff line change
Expand Up @@ -55,32 +55,19 @@ NDArray_FMatmul(NDArray *a, NDArray *b) {
cublasHandle_t handle;
cublasCreate(&handle);

float* d_A;
float* d_B;
float* d_C;
size_t size_A = NDArray_NUMELEMENTS(a) * sizeof(float);
size_t size_B = NDArray_NUMELEMENTS(b) * sizeof(float);
size_t size_C = NDArray_NUMELEMENTS(result) * sizeof(float);

cudaMalloc((void**)&d_A, size_A);
cudaMalloc((void**)&d_B, size_B);
cudaMalloc((void**)&d_C, size_C);

cudaMemcpy(d_A, NDArray_FDATA(a), size_A, cudaMemcpyHostToDevice);
cudaMemcpy(d_B, NDArray_FDATA(b), size_B, cudaMemcpyHostToDevice);
float* deviceResult;
size_t sizeResult = NDArray_NUMELEMENTS(result) * sizeof(float);

vmalloc((void**)&deviceResult, sizeResult);
int m = NDArray_SHAPE(a)[0];
int n = NDArray_SHAPE(b)[1];
int k = NDArray_SHAPE(a)[1];
float alpha = 1.0f;
float beta = 0.0f;

cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, d_B, n, d_A, k, &beta, d_C, n);
cudaMemcpy(NDArray_FDATA(result), d_C, size_C, cudaMemcpyDeviceToHost);

cudaFree(d_A);
cudaFree(d_B);
cudaFree(d_C);
cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, n, m, k, &alpha, NDArray_FDATA(b), n, NDArray_FDATA(a), k, &beta, deviceResult, n);
vfree(result->data);
result->data = (void*)deviceResult;
cublasDestroy(handle);
#endif
} else {
Expand Down

0 comments on commit 11dbea0

Please sign in to comment.