Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 4cde649

Browse files
CUDA OpenACC version
1 parent f90c17f commit 4cde649

File tree

10 files changed

+1899
-0
lines changed

10 files changed

+1899
-0
lines changed
601 KB
Binary file not shown.
790 KB
Binary file not shown.
Lines changed: 354 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,354 @@
1+
#include <stdlib.h>
2+
#include <stdio.h>
3+
#include <time.h>
4+
#include <cuda.h>
5+
#include <cublas_v2.h>
6+
7+
#define BLOCKSIZE 16
8+
9+
// CUDA ERROR CHECK
10+
#define CUDA_CHECK(call) \
11+
do \
12+
{ \
13+
cudaError_t err = (call); \
14+
if (err != cudaSuccess) \
15+
{ \
16+
fprintf(stderr, "CUDA error at %s:%d: %s\n", \
17+
__FILE__, __LINE__, cudaGetErrorString(err)); \
18+
exit(1); \
19+
} \
20+
} while (0)
21+
22+
void matmat_seq(double *A, double *B, double *C, const int N)
23+
{
24+
double sum = 0.0;
25+
for (int i = 0; i < N; i++)
26+
{
27+
for (int j = 0; j < N; j++)
28+
{
29+
sum = 0.0;
30+
for (int k = 0; k < N; k++)
31+
{
32+
sum += A[i * N + k] * B[k * N + j];
33+
}
34+
C[i * N + j] = sum;
35+
}
36+
}
37+
}
38+
39+
40+
__global__ void matmat_naive_kernel(double *A, double *B, double *C, const int N)
41+
{
42+
int i = threadIdx.y + blockDim.y * blockIdx.y;
43+
int j = threadIdx.x + blockDim.x * blockIdx.x;
44+
45+
if (i < N && j < N)
46+
{
47+
double sum = 0.0;
48+
for (int k = 0; k < N; k++)
49+
{
50+
sum += A[i * N + k] * B[k * N + j];
51+
}
52+
C[i * N + j] = sum;
53+
}
54+
}
55+
56+
__global__ void matmat_shared_kernel(double *A, double *B, double *C, const int N)
57+
{
58+
int i = threadIdx.y + blockDim.y * blockIdx.y;
59+
int j = threadIdx.x + blockDim.x * blockIdx.x;
60+
61+
__shared__ double A_s[BLOCKSIZE][BLOCKSIZE];
62+
__shared__ double B_s[BLOCKSIZE][BLOCKSIZE];
63+
64+
double sum = 0.0;
65+
int blocks = (N + BLOCKSIZE - 1) / BLOCKSIZE;
66+
// Loop in tiles
67+
for (int block = 0; block < blocks; block++)
68+
{
69+
// Load A block to shared memory
70+
int k = block * BLOCKSIZE + threadIdx.x;
71+
if (i < N && k < N)
72+
{
73+
A_s[threadIdx.y][threadIdx.x] = A[i * N + k];
74+
}
75+
else
76+
{
77+
A_s[threadIdx.y][threadIdx.x] = 0.0;
78+
}
79+
80+
// Load B block to shared memory
81+
k = block * BLOCKSIZE + threadIdx.y;
82+
if (k < N && j < N)
83+
{
84+
B_s[threadIdx.y][threadIdx.x] = B[k * N + j];
85+
}
86+
else
87+
{
88+
B_s[threadIdx.y][threadIdx.x] = 0.0;
89+
}
90+
// Wait for all threads to load A_s and B_s
91+
__syncthreads();
92+
93+
// Compute the block
94+
// We don't need to check outbounds here
95+
// because if we are out, the Element of A_s or B_s is 0.0
96+
for (int kk = 0; kk < BLOCKSIZE; kk++)
97+
{
98+
sum += A_s[threadIdx.y][kk] * B_s[kk][threadIdx.x];
99+
}
100+
__syncthreads();
101+
}
102+
if (i < N && j < N)
103+
C[i * N + j] = sum;
104+
}
105+
106+
void validation(double *h_C, double *C, const int N)
107+
{
108+
for (int i = 0; i < N; i++)
109+
{
110+
for (int j = 0; j < N; j++)
111+
{
112+
double err = fabs(h_C[i * N + j] - C[i * N + j]);
113+
if (err > 1.0e-6)
114+
{
115+
printf("Error at C[%d][%d]: fabs( %f - %f ) = %e > %e\n", i, j, h_C[i * N + j], C[i * N + j], err, 1.0e-6);
116+
exit(1);
117+
}
118+
}
119+
}
120+
}
121+
122+
void copy_A_B_H2D(double *h_A, double *h_B, double *d_A, double *d_B, const size_t bytes,
123+
cudaEvent_t *event_start, cudaEvent_t *event_end, float *total_time_ms, const char *case_name)
124+
{
125+
float time_ms = 0.0;
126+
CUDA_CHECK(cudaEventRecord(*event_start));
127+
128+
CUDA_CHECK(cudaMemcpy(d_A, h_A, bytes, cudaMemcpyHostToDevice));
129+
CUDA_CHECK(cudaMemcpy(d_B, h_B, bytes, cudaMemcpyHostToDevice));
130+
131+
CUDA_CHECK(cudaEventRecord(*event_end));
132+
CUDA_CHECK(cudaEventSynchronize(*event_end));
133+
CUDA_CHECK(cudaEventElapsedTime(&time_ms, *event_start, *event_end));
134+
printf("%s GPU H2D copy time: %.9f seconds\n", case_name, time_ms / 1000);
135+
*total_time_ms += time_ms;
136+
}
137+
138+
void copy_C_D2H(double *h_C, double *d_C, const size_t bytes,
139+
cudaEvent_t *event_start, cudaEvent_t *event_end, float *total_time_ms, const char *case_name)
140+
{
141+
float time_ms = 0.0;
142+
CUDA_CHECK(cudaEventRecord(*event_start));
143+
144+
CUDA_CHECK(cudaMemcpy(h_C, d_C, bytes, cudaMemcpyDeviceToHost));
145+
146+
CUDA_CHECK(cudaEventRecord(*event_end));
147+
CUDA_CHECK(cudaEventSynchronize(*event_end));
148+
CUDA_CHECK(cudaEventElapsedTime(&time_ms, *event_start, *event_end));
149+
printf("%s GPU D2H copy time: %.9f seconds\n", case_name, time_ms / 1000);
150+
*total_time_ms += time_ms;
151+
}
152+
153+
void init_C_gpu(double *h_C, double *d_C, const int N)
154+
{
155+
for (int i = 0; i < N; i++)
156+
{
157+
for (int j = 0; j < N; j++)
158+
{
159+
h_C[i * N + j] = -1.0;
160+
}
161+
}
162+
163+
CUDA_CHECK(cudaMemset(d_C, 0, N * N * sizeof(double)));
164+
}
165+
166+
int main(int argc, char *argv[])
167+
{
168+
// Argument parsing
169+
if (argc != 3)
170+
{
171+
printf("Usage: %s <matrix size NxN> <check>\n", argv[0]);
172+
return 1;
173+
}
174+
175+
int N = atoi(argv[1]);
176+
int check = atoi(argv[2]);
177+
178+
printf("Matrix size: %d x %d\n", N, N);
179+
180+
//
181+
// Memory allocation
182+
//
183+
// Host
184+
size_t bytes = N * N * sizeof(double);
185+
double *h_A = (double *)malloc(bytes);
186+
double *h_B = (double *)malloc(bytes);
187+
double *h_C = (double *)malloc(bytes);
188+
double *C = (double *)malloc(bytes);
189+
190+
// Device
191+
double *d_A, *d_B, *d_C;
192+
CUDA_CHECK(cudaMalloc((void **)&d_A, bytes));
193+
CUDA_CHECK(cudaMalloc((void **)&d_B, bytes));
194+
CUDA_CHECK(cudaMalloc((void **)&d_C, bytes));
195+
CUDA_CHECK(cudaMemset(d_C, 0, bytes)); // Init d_C to 0
196+
197+
//
198+
// Matrices initialization
199+
//
200+
for (int i = 0; i < N; i++)
201+
{
202+
for (int j = 0; j < N; j++)
203+
{
204+
// Row-major
205+
h_A[i * N + j] = drand48();
206+
h_B[i * N + j] = drand48();
207+
h_C[i * N + j] = -1.0;
208+
C[i * N + j] = -1.0;
209+
}
210+
}
211+
212+
//
213+
// Sequential
214+
//
215+
if (check)
216+
{
217+
struct timespec start, end;
218+
clock_gettime(CLOCK_MONOTONIC, &start);
219+
220+
matmat_seq(h_A, h_B, C, N);
221+
222+
clock_gettime(CLOCK_MONOTONIC, &end);
223+
double elapsed = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1.0e9;
224+
printf("Sequential elapsed time: %.9f seconds\n", elapsed);
225+
}
226+
else
227+
{
228+
printf("Sequential and validation deactivated\n");
229+
}
230+
231+
//
232+
// GPU computations
233+
//
234+
cudaEvent_t event_start, event_end;
235+
float time_ms = 0.0;
236+
float total_time_ms = 0.0;
237+
CUDA_CHECK(cudaEventCreate(&event_start));
238+
CUDA_CHECK(cudaEventCreate(&event_end));
239+
240+
//
241+
// Naive kernel
242+
//
243+
// Copy Host to Device
244+
copy_A_B_H2D(h_A, h_B, d_A, d_B, bytes, &event_start, &event_end, &total_time_ms, "Naive");
245+
246+
// Kernel launch
247+
dim3 threads(BLOCKSIZE, BLOCKSIZE);
248+
dim3 blocks((N + BLOCKSIZE - 1) / BLOCKSIZE, (N + BLOCKSIZE - 1) / BLOCKSIZE);
249+
CUDA_CHECK(cudaEventRecord(event_start));
250+
251+
matmat_naive_kernel<<<blocks, threads>>>(d_A, d_B, d_C, N);
252+
CUDA_CHECK(cudaGetLastError());
253+
CUDA_CHECK(cudaDeviceSynchronize());
254+
255+
CUDA_CHECK(cudaEventRecord(event_end));
256+
CUDA_CHECK(cudaEventSynchronize(event_end));
257+
CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_start, event_end));
258+
printf("Naive GPU kernel time: %.9f seconds\n", time_ms / 1000);
259+
total_time_ms += time_ms;
260+
time_ms = 0.0;
261+
262+
// Copy Device to Host
263+
copy_C_D2H(h_C, d_C, bytes, &event_start, &event_end, &total_time_ms, "Naive");
264+
265+
printf("Naive GPU total time: %.9f seconds\n", total_time_ms / 1000);
266+
total_time_ms = 0.0;
267+
268+
// Validate
269+
if (check)
270+
validation(h_C, C, N);
271+
272+
//
273+
// Shared memory kernel
274+
//
275+
init_C_gpu(h_C, d_C, N);
276+
// Copy Host to Device
277+
copy_A_B_H2D(h_A, h_B, d_A, d_B, bytes, &event_start, &event_end, &total_time_ms, "Shared");
278+
279+
// Kernel launch
280+
CUDA_CHECK(cudaEventRecord(event_start));
281+
matmat_shared_kernel<<<blocks, threads>>>(d_A, d_B, d_C, N);
282+
CUDA_CHECK(cudaGetLastError());
283+
CUDA_CHECK(cudaDeviceSynchronize());
284+
285+
CUDA_CHECK(cudaEventRecord(event_end));
286+
CUDA_CHECK(cudaEventSynchronize(event_end));
287+
CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_start, event_end));
288+
printf("Shared GPU kernel time: %.9f seconds\n", time_ms / 1000);
289+
total_time_ms += time_ms;
290+
time_ms = 0.0;
291+
292+
// Copy Device to Host
293+
copy_C_D2H(h_C, d_C, bytes, &event_start, &event_end, &total_time_ms, "Shared");
294+
295+
printf("Shared GPU total time: %.9f seconds\n", total_time_ms / 1000);
296+
total_time_ms = 0.0;
297+
298+
// Validate
299+
if (check)
300+
validation(h_C, C, N);
301+
302+
//
303+
// cuBLAS
304+
//
305+
init_C_gpu(h_C, d_C, N);
306+
cublasHandle_t cublas_handle;
307+
cublasCreate(&cublas_handle);
308+
309+
// Copy Host to Device
310+
copy_A_B_H2D(h_A, h_B, d_A, d_B, bytes, &event_start, &event_end, &total_time_ms, "cuBLAS");
311+
312+
CUDA_CHECK(cudaEventRecord(event_start));
313+
314+
const double alpha = 1.0;
315+
const double beta = 0.0;
316+
cublasDgemm(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_B, N, d_A, N, &beta, d_C, N);
317+
318+
CUDA_CHECK(cudaDeviceSynchronize());
319+
320+
CUDA_CHECK(cudaEventRecord(event_end));
321+
CUDA_CHECK(cudaEventSynchronize(event_end));
322+
CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_start, event_end));
323+
printf("cuBLAS GPU kernel time: %.9f seconds\n", time_ms / 1000);
324+
total_time_ms += time_ms;
325+
time_ms = 0.0;
326+
327+
// Copy Device to Host
328+
copy_C_D2H(h_C, d_C, bytes, &event_start, &event_end, &total_time_ms, "cuBLAS");
329+
330+
printf("cuBLAS GPU total time: %.9f seconds\n", total_time_ms / 1000);
331+
332+
// Validate
333+
if (check)
334+
validation(h_C, C, N);
335+
336+
//
337+
// Free memory
338+
//
339+
// Host
340+
free(h_A);
341+
free(h_B);
342+
free(h_C);
343+
free(C);
344+
345+
// Device
346+
CUDA_CHECK(cudaFree(d_A));
347+
CUDA_CHECK(cudaFree(d_B));
348+
CUDA_CHECK(cudaFree(d_C));
349+
CUDA_CHECK(cudaEventDestroy(event_start));
350+
CUDA_CHECK(cudaEventDestroy(event_end));
351+
cublasDestroy(cublas_handle);
352+
353+
return 0;
354+
}

0 commit comments

Comments
(0)

AltStyle によって変換されたページ (->オリジナル) /