#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <time.h>
#include <cuda.h>
#include <cublas_v2.h>

#define BLOCKSIZE 16
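
// Example build and run (assumed invocation; the source file name "matmul.cu"
// is hypothetical and not given in the original):
//   nvcc -O3 -o matmul matmul.cu -lcublas
//   ./matmul 1024 1    (1024x1024 matrices, with CPU reference and validation)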

// CUDA ERROR CHECK
#define CUDA_CHECK(call)                                              \
    do                                                                \
    {                                                                 \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess)                                       \
        {                                                             \
            fprintf(stderr, "CUDA error at %s:%d: %s\n",              \
                    __FILE__, __LINE__, cudaGetErrorString(err));     \
            exit(1);                                                  \
        }                                                             \
    } while (0)

// Sequential (CPU) reference: C = A * B, row-major
void matmat_seq(double *A, double *B, double *C, const int N)
{
    double sum = 0.0;
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            sum = 0.0;
            for (int k = 0; k < N; k++)
            {
                sum += A[i * N + k] * B[k * N + j];
            }
            C[i * N + j] = sum;
        }
    }
}

// Naive kernel: one thread per output element, all operands read from global memory
__global__ void matmat_naive_kernel(double *A, double *B, double *C, const int N)
{
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    if (i < N && j < N)
    {
        double sum = 0.0;
        for (int k = 0; k < N; k++)
        {
            sum += A[i * N + k] * B[k * N + j];
        }
        C[i * N + j] = sum;
    }
}

// Tiled kernel: each block stages BLOCKSIZE x BLOCKSIZE tiles of A and B in shared memory
__global__ void matmat_shared_kernel(double *A, double *B, double *C, const int N)
{
    int i = threadIdx.y + blockDim.y * blockIdx.y;
    int j = threadIdx.x + blockDim.x * blockIdx.x;

    __shared__ double A_s[BLOCKSIZE][BLOCKSIZE];
    __shared__ double B_s[BLOCKSIZE][BLOCKSIZE];

    double sum = 0.0;
    int blocks = (N + BLOCKSIZE - 1) / BLOCKSIZE;
    // Loop over tiles
    for (int block = 0; block < blocks; block++)
    {
        // Load A tile into shared memory
        int k = block * BLOCKSIZE + threadIdx.x;
        if (i < N && k < N)
        {
            A_s[threadIdx.y][threadIdx.x] = A[i * N + k];
        }
        else
        {
            A_s[threadIdx.y][threadIdx.x] = 0.0;
        }

        // Load B tile into shared memory
        k = block * BLOCKSIZE + threadIdx.y;
        if (k < N && j < N)
        {
            B_s[threadIdx.y][threadIdx.x] = B[k * N + j];
        }
        else
        {
            B_s[threadIdx.y][threadIdx.x] = 0.0;
        }
        // Wait for all threads to finish loading A_s and B_s
        __syncthreads();

        // Compute the tile product.
        // No bounds check is needed here: out-of-range elements
        // of A_s and B_s were zero-filled above.
        for (int kk = 0; kk < BLOCKSIZE; kk++)
        {
            sum += A_s[threadIdx.y][kk] * B_s[kk][threadIdx.x];
        }
        __syncthreads();
    }
    if (i < N && j < N)
        C[i * N + j] = sum;
}

// Compare the GPU result h_C against the CPU reference C
void validation(double *h_C, double *C, const int N)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            double err = fabs(h_C[i * N + j] - C[i * N + j]);
            if (err > 1.0e-6)
            {
                printf("Error at C[%d][%d]: fabs(%f - %f) = %e > %e\n", i, j, h_C[i * N + j], C[i * N + j], err, 1.0e-6);
                exit(1);
            }
        }
    }
}

// Copy A and B from host to device, timing the transfer with CUDA events
void copy_A_B_H2D(double *h_A, double *h_B, double *d_A, double *d_B, const size_t bytes,
                  cudaEvent_t *event_start, cudaEvent_t *event_end, float *total_time_ms, const char *case_name)
{
    float time_ms = 0.0;
    CUDA_CHECK(cudaEventRecord(*event_start));

    CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));

    CUDA_CHECK(cudaEventRecord(*event_end));
    CUDA_CHECK(cudaEventSynchronize(*event_end));
    CUDA_CHECK(cudaEventElapsedTime(&time_ms, *event_start, *event_end));
    printf("%s GPU H2D copy time: %.9f seconds\n", case_name, time_ms / 1000);
    *total_time_ms += time_ms;
}

// Copy C from device to host, timing the transfer with CUDA events
void copy_C_D2H(double *h_C, double *d_C, const size_t bytes,
                cudaEvent_t *event_start, cudaEvent_t *event_end, float *total_time_ms, const char *case_name)
{
    float time_ms = 0.0;
    CUDA_CHECK(cudaEventRecord(*event_start));

    CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaEventRecord(*event_end));
    CUDA_CHECK(cudaEventSynchronize(*event_end));
    CUDA_CHECK(cudaEventElapsedTime(&time_ms, *event_start, *event_end));
    printf("%s GPU D2H copy time: %.9f seconds\n", case_name, time_ms / 1000);
    *total_time_ms += time_ms;
}

// Reset h_C to a sentinel value and zero d_C before the next GPU run
void init_C_gpu(double *h_C, double *d_C, const int N)
{
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            h_C[i * N + j] = -1.0;
        }
    }

    CUDA_CHECK(cudaMemset(d_C, 0, N * N * sizeof(double)));
}

int main(int argc, char *argv[])
{
    // Argument parsing
    if (argc != 3)
    {
        printf("Usage: %s <matrix size NxN> <check>\n", argv[0]);
        return 1;
    }

    int N = atoi(argv[1]);
    int check = atoi(argv[2]);

    printf("Matrix size: %d x %d\n", N, N);

    //
    // Memory allocation
    //
    // Host
    size_t bytes = N * N * sizeof(double);
    double *h_A = (double *)malloc(bytes);
    double *h_B = (double *)malloc(bytes);
    double *h_C = (double *)malloc(bytes);
    double *C = (double *)malloc(bytes);

    // Device
    double *d_A, *d_B, *d_C;
    CUDA_CHECK(cudaMalloc((void **)&d_A, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_B, bytes));
    CUDA_CHECK(cudaMalloc((void **)&d_C, bytes));
    CUDA_CHECK(cudaMemset(d_C, 0, bytes)); // Init d_C to 0

    //
    // Matrix initialization
    //
    for (int i = 0; i < N; i++)
    {
        for (int j = 0; j < N; j++)
        {
            // Row-major
            h_A[i * N + j] = drand48();
            h_B[i * N + j] = drand48();
            h_C[i * N + j] = -1.0;
            C[i * N + j] = -1.0;
        }
    }

    //
    // Sequential
    //
    if (check)
    {
        struct timespec start, end;
        clock_gettime(CLOCK_MONOTONIC, &start);

        matmat_seq(h_A, h_B, C, N);

        clock_gettime(CLOCK_MONOTONIC, &end);
        double elapsed = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1.0e9;
        printf("Sequential elapsed time: %.9f seconds\n", elapsed);
    }
    else
    {
        printf("Sequential and validation deactivated\n");
    }

    //
    // GPU computations
    //
    cudaEvent_t event_start, event_end;
    float time_ms = 0.0;
    float total_time_ms = 0.0;
    CUDA_CHECK(cudaEventCreate(&event_start));
    CUDA_CHECK(cudaEventCreate(&event_end));

    //
    // Naive kernel
    //
    // Copy Host to Device
    copy_A_B_H2D(h_A, h_B, d_A, d_B, bytes, &event_start, &event_end, &total_time_ms, "Naive");

    // Kernel launch
    dim3 threads(BLOCKSIZE, BLOCKSIZE);
    dim3 blocks((N + BLOCKSIZE - 1) / BLOCKSIZE, (N + BLOCKSIZE - 1) / BLOCKSIZE);
    CUDA_CHECK(cudaEventRecord(event_start));

    matmat_naive_kernel<<<blocks, threads>>>(d_A, d_B, d_C, N);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaEventRecord(event_end));
    CUDA_CHECK(cudaEventSynchronize(event_end));
    CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_start, event_end));
    printf("Naive GPU kernel time: %.9f seconds\n", time_ms / 1000);
    total_time_ms += time_ms;
    time_ms = 0.0;

    // Copy Device to Host
    copy_C_D2H(h_C, d_C, bytes, &event_start, &event_end, &total_time_ms, "Naive");

    printf("Naive GPU total time: %.9f seconds\n", total_time_ms / 1000);
    total_time_ms = 0.0;

    // Validate
    if (check)
        validation(h_C, C, N);
272
+ //
273
+ // Shared memory kernel
274
+ //
275
+ init_C_gpu (h_C, d_C, N);
276
+ // Copy Host to Device
277
+ copy_A_B_H2D (h_A, h_B, d_A, d_B, bytes, &event_start, &event_end, &total_time_ms, " Shared" );
278
+
279
+ // Kernel launch
280
+ CUDA_CHECK (cudaEventRecord (event_start));
281
+ matmat_shared_kernel<<<blocks, threads>>> (d_A, d_B, d_C, N);
282
+ CUDA_CHECK (cudaGetLastError ());
283
+ CUDA_CHECK (cudaDeviceSynchronize ());
284
+
285
+ CUDA_CHECK (cudaEventRecord (event_end));
286
+ CUDA_CHECK (cudaEventSynchronize (event_end));
287
+ CUDA_CHECK (cudaEventElapsedTime (&time_ms, event_start, event_end));
288
+ printf (" Shared GPU kernel time: %.9f seconds\n " , time_ms / 1000 );
289
+ total_time_ms += time_ms;
290
+ time_ms = 0.0 ;
291
+
292
+ // Copy Device to Host
293
+ copy_C_D2H (h_C, d_C, bytes, &event_start, &event_end, &total_time_ms, " Shared" );
294
+
295
+ printf (" Shared GPU total time: %.9f seconds\n " , total_time_ms / 1000 );
296
+ total_time_ms = 0.0 ;
297
+
298
+ // Validate
299
+ if (check)
300
+ validation (h_C, C, N);

    //
    // cuBLAS
    //
    init_C_gpu(h_C, d_C, N);
    cublasHandle_t cublas_handle;
    cublasCreate(&cublas_handle);

    // Copy Host to Device
    copy_A_B_H2D(h_A, h_B, d_A, d_B, bytes, &event_start, &event_end, &total_time_ms, "cuBLAS");

    CUDA_CHECK(cudaEventRecord(event_start));

    const double alpha = 1.0;
    const double beta = 0.0;
    // cuBLAS assumes column-major storage; swapping the operands (B first, then A)
    // makes the column-major result equal to the row-major product C = A * B.
    cublasDgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_B, N, d_A, N, &beta, d_C, N);

    CUDA_CHECK(cudaDeviceSynchronize());

    CUDA_CHECK(cudaEventRecord(event_end));
    CUDA_CHECK(cudaEventSynchronize(event_end));
    CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_start, event_end));
    printf("cuBLAS GPU kernel time: %.9f seconds\n", time_ms / 1000);
    total_time_ms += time_ms;
    time_ms = 0.0;

    // Copy Device to Host
    copy_C_D2H(h_C, d_C, bytes, &event_start, &event_end, &total_time_ms, "cuBLAS");

    printf("cuBLAS GPU total time: %.9f seconds\n", total_time_ms / 1000);

    // Validate
    if (check)
        validation(h_C, C, N);

    //
    // Free memory
    //
    // Host
    free(h_A);
    free(h_B);
    free(h_C);
    free(C);

    // Device
    CUDA_CHECK(cudaFree(d_A));
    CUDA_CHECK(cudaFree(d_B));
    CUDA_CHECK(cudaFree(d_C));
    CUDA_CHECK(cudaEventDestroy(event_start));
    CUDA_CHECK(cudaEventDestroy(event_end));
    cublasDestroy(cublas_handle);

    return 0;
}