#include <cuda_runtime.h>
#include <stdio.h>
#include <stdlib.h>

// Width of one square shared-memory tile; a 16x16 tile gives 256 threads
// per block, a safe configuration on all current GPUs.
#define TILE_SIZE 16

// Minimal CUDA error handler: print the error string and source location,
// then optionally abort.
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort = true) {
    if (code != cudaSuccess) {
        fprintf(stderr, "CUDA error: %s %s %d\n", cudaGetErrorString(code), file, line);
        if (abort) exit(code);
    }
}

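// Convenience wrapper so every CUDA API call can be checked in one line.
// This is a common idiom; the macro (and its name, gpuErrchk) is an
// addition here, not part of the original listing.
#define gpuErrchk(ans) { gpuAssert((ans), __FILE__, __LINE__); }
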
// Tiled matrix multiplication: C = A * B for square N x N matrices.
// Each block computes one TILE_SIZE x TILE_SIZE tile of C, staging tiles of
// A and B through shared memory to cut global-memory traffic.
__global__ void matrixMulKernel(const float *A, const float *B, float *C, int N) {
    __shared__ float sharedA[TILE_SIZE][TILE_SIZE];
    __shared__ float sharedB[TILE_SIZE][TILE_SIZE];

    int tx = threadIdx.x, ty = threadIdx.y;
    int row = blockIdx.y * TILE_SIZE + ty;  // row of C this thread computes
    int col = blockIdx.x * TILE_SIZE + tx;  // column of C this thread computes
    float sum = 0.0f;

    // Walk the tiles along the shared dimension: ceil(N / TILE_SIZE) steps.
    for (int t = 0; t < (N + TILE_SIZE - 1) / TILE_SIZE; t++) {
        // Cooperatively load one tile of A and one tile of B, padding
        // out-of-range elements with zero so they don't affect the sum.
        if (row < N && t * TILE_SIZE + tx < N)
            sharedA[ty][tx] = A[row * N + t * TILE_SIZE + tx];
        else
            sharedA[ty][tx] = 0.0f;

        if (col < N && t * TILE_SIZE + ty < N)
            sharedB[ty][tx] = B[(t * TILE_SIZE + ty) * N + col];
        else
            sharedB[ty][tx] = 0.0f;

        __syncthreads();  // both tiles must be fully loaded before use

        for (int k = 0; k < TILE_SIZE; k++) {
            sum += sharedA[ty][k] * sharedB[k][tx];
        }

        __syncthreads();  // finish reading before the next iteration overwrites
    }

    if (row < N && col < N)
        C[row * N + col] = sum;
}

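// CPU reference implementation (an added sketch for verification; the name
// matMulCPU is our choice, not from the original listing). A plain O(N^3)
// triple loop, practical only for small N.
void matMulCPU(const float *A, const float *B, float *C, int N) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            float sum = 0.0f;
            for (int k = 0; k < N; k++) {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}
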
// Host wrapper: allocate device buffers, copy inputs over, launch the
// kernel, and copy the result back.
void matrixMultiply(float *h_A, float *h_B, float *h_C, int N) {
    float *d_A, *d_B, *d_C;
    size_t size = N * N * sizeof(float);

    gpuErrchk(cudaMalloc(&d_A, size));
    gpuErrchk(cudaMalloc(&d_B, size));
    gpuErrchk(cudaMalloc(&d_C, size));

    gpuErrchk(cudaMemcpy(d_A, h_A, size, cudaMemcpyHostToDevice));
    gpuErrchk(cudaMemcpy(d_B, h_B, size, cudaMemcpyHostToDevice));

    dim3 blockSize(TILE_SIZE, TILE_SIZE);
    dim3 gridSize((N + TILE_SIZE - 1) / TILE_SIZE, (N + TILE_SIZE - 1) / TILE_SIZE);

    matrixMulKernel<<<gridSize, blockSize>>>(d_A, d_B, d_C, N);

    // Check for launch errors first, then for errors raised during execution.
    gpuErrchk(cudaGetLastError());
    gpuErrchk(cudaDeviceSynchronize());

    gpuErrchk(cudaMemcpy(h_C, d_C, size, cudaMemcpyDeviceToHost));

    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
}

int main() {
    int N = 64;  // matrix dimension (N x N)
    size_t size = N * N * sizeof(float);

    float *h_A = (float *)malloc(size);
    float *h_B = (float *)malloc(size);
    float *h_C = (float *)malloc(size);
    if (!h_A || !h_B || !h_C) {
        fprintf(stderr, "Host allocation failed\n");
        return 1;
    }

    // Fill inputs with small integer values in [0, 9].
    for (int i = 0; i < N * N; i++) {
        h_A[i] = (float)(rand() % 10);
        h_B[i] = (float)(rand() % 10);
    }

    matrixMultiply(h_A, h_B, h_C, N);

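    // Sanity check against the CPU reference above (an added sketch, not part
    // of the original program). With entries in [0, 9] and N = 64, every dot
    // product is a small integer, exactly representable in float, so the GPU
    // and CPU results should match exactly.
    float *h_ref = (float *)malloc(size);
    if (h_ref) {
        matMulCPU(h_A, h_B, h_ref, N);
        int mismatches = 0;
        for (int i = 0; i < N * N; i++)
            if (h_C[i] != h_ref[i]) mismatches++;
        printf("Verification vs CPU reference: %s (%d mismatches)\n",
               mismatches == 0 ? "PASS" : "FAIL", mismatches);
        free(h_ref);
    }
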
    printf("Result matrix:\n");
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            printf("%0.1f ", h_C[i * N + j]);
        }
        printf("\n");
    }

    free(h_A);
    free(h_B);
    free(h_C);

    return 0;
}