/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: CUDAGaussianBlur.cu,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.18 $       $Date: 2020/02/26 20:31:15 $
 *
 ***************************************************************************/

#define WATERSHED_INTERNAL 1

#include <stdio.h>
#include "CUDAGaussianBlur.h"
#include "Watershed.h"
#include "ProfileHooks.h"
#include <cuda_fp16.h>

#if 0
#define CUERR { cudaError_t err; \
  cudaDeviceSynchronize(); \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  }}
#else
#define CUERR
#endif

#define KERNEL_MAX_SIZE 4096

__constant__ float kernel_d_c[KERNEL_MAX_SIZE];

#define BLOCKDIM_X 128
#define BLOCKDIM_Y 1
#define BLOCKDIM_Z 1

// optimized for kernel size 21
#define XCONV_BLOCKDIM_X 96
#define XCONV_BLOCKDIM_Y 1
#define XCONV_BLOCKDIM_Z 1

#define YCONV_BLOCKDIM_X 16
#define YCONV_BLOCKDIM_Y 8
#define YCONV_BLOCKDIM_Z 1

#define ZCONV_BLOCKDIM_X 4
#define ZCONV_BLOCKDIM_Y 1
#define ZCONV_BLOCKDIM_Z 32

#define X_PADDING 32

#define SWAPF(a, b) {\
  float* t = a;\
  a = b;\
  b = t;\
}

#define INST_GAUSSIAN_CUDA(I_T) \
  template void gaussian1D_x_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
      int width, int height, int depth);\
  template void gaussian1D_y_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
      int width, int height, int depth);\
  template void gaussian1D_z_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
      int width, int height, int depth);

INST_GAUSSIAN_CUDA(float)
//INST_GAUSSIAN_CUDA(half)
INST_GAUSSIAN_CUDA(unsigned short)
INST_GAUSSIAN_CUDA(unsigned char)

// Function that converts val to the appropriate type and then assigns it to arr[idx].
// This is necessary for converting a real type to an integer type so we can handle rounding.
template <typename T>
__device__ __forceinline__ void convert_type_and_assign_val(T* arr, long idx, float val) {
  arr[idx] = roundf(val);
}

template <> __device__ __forceinline__ void convert_type_and_assign_val<float>(float* arr, long idx, float val) {
  arr[idx] = val;
}

#if CUDART_VERSION >= 9000
// half-precision only became mainstream with CUDA 9.x and later versions
template <> __device__ __forceinline__ void convert_type_and_assign_val<half>(half* arr, long idx, float val) {
  arr[idx] = val;
}
#endif

void copy_array_from_gpu(void* arr, void* arr_d, int bytes) {
  PROFILE_PUSH_RANGE("CUDAGaussianBlur::copy_array_from_gpu()", 5);

  cudaMemcpy(arr, arr_d, bytes, cudaMemcpyDeviceToHost);
  CUERR // check and clear any existing errors

  PROFILE_POP_RANGE();
}


void copy_array_to_gpu(void* arr_d, void* arr, int bytes) {
  cudaMemcpyAsync(arr_d, arr, bytes, cudaMemcpyHostToDevice);
  CUERR // check and clear any existing errors
}


void* alloc_cuda_array(int bytes) {
  void* t;
  cudaMalloc(&t, bytes);
  CUERR // check and clear any existing errors
  return t;
}


void free_cuda_array(void* arr) {
  cudaFree(arr);
}


void set_gaussian_1D_kernel_cuda(float* kernel, int kernel_size) {

  if (kernel_size > KERNEL_MAX_SIZE) {
    printf("Warning: exceeded maximum kernel size on GPU.\n");
    kernel_size = KERNEL_MAX_SIZE;
  }

  cudaMemcpyToSymbolAsync(kernel_d_c, kernel, kernel_size * sizeof(float));
  CUERR // check and clear any existing errors
}
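
// The constant-memory array kernel_d_c is expected to hold normalized 1-D
// Gaussian weights.  As an illustration only (not part of this file's API),
// a host-side helper along the following lines could generate and upload
// those weights; the name make_gaussian_kernel_example and its calling code
// are hypothetical, and expf() would require <math.h>.
#if 0
static void make_gaussian_kernel_example(float sigma, int kernel_size) {
  // kernel_size is assumed odd and <= KERNEL_MAX_SIZE
  float* kernel = new float[kernel_size];
  int offset = kernel_size / 2;
  float sum = 0.0f;
  for (int i = 0; i < kernel_size; ++i) {
    float x = float(i - offset);
    kernel[i] = expf(-x*x / (2.0f * sigma * sigma));
    sum += kernel[i];
  }
  for (int i = 0; i < kernel_size; ++i) {
    kernel[i] /= sum;   // normalize so the weights sum to 1
  }
  set_gaussian_1D_kernel_cuda(kernel, kernel_size);
  delete [] kernel;
}
#endif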


template <typename IMAGE_T, const int offset>
__global__ void convolution_x_kernel_s(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  __shared__ IMAGE_T src_s[XCONV_BLOCKDIM_Z * XCONV_BLOCKDIM_Y * (XCONV_BLOCKDIM_X + offset * 2)];
  const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;

  if (y >= height || z >= depth) {
    return;
  }

  // global idx
  const long idx = z * long(height) * long(width) + y * long(width) + x;

  // idx of first entry in row (x = 0)
  const long g_row_base_idx = z * long(height) * long(width) + y * long(width);

  // idx of first entry in shared mem array row
  const int t_row_base_idx = threadIdx.z * XCONV_BLOCKDIM_Y * (XCONV_BLOCKDIM_X + offset * 2)
                             + threadIdx.y * (XCONV_BLOCKDIM_X + offset*2) + offset;

  // idx of first x coord in this block
  const int base_x = x - threadIdx.x;

  // Check if we need to deal with edge cases
  if (base_x - offset < 0 || base_x + XCONV_BLOCKDIM_X + offset >= width) {
    for (int i = threadIdx.x - offset; i < XCONV_BLOCKDIM_X + offset; i += XCONV_BLOCKDIM_X) {
      int x_offset = base_x + i;
      if (x_offset < 0) // left edge case
        x_offset = 0;
      else if (x_offset >= width) // right edge case
        x_offset = width-1;
      src_s[t_row_base_idx + i] = src_d[g_row_base_idx + x_offset];
    }
  } else {
    for (int i = threadIdx.x - offset; i < XCONV_BLOCKDIM_X + offset; i += XCONV_BLOCKDIM_X) {
      int x_offset = base_x + i;
      src_s[t_row_base_idx + i] = src_d[g_row_base_idx + x_offset];
    }
  }
  if (x >= width)
    return;

  // XXX is this safe to do with early returns on all GPUs?
  __syncthreads();

  const IMAGE_T* src_s_offset = src_s + t_row_base_idx + threadIdx.x;
  const float* kernel_offset = kernel_d_c + offset;
  float value = 0.0f;
#pragma unroll
  for (int i = -offset; i <= offset; ++i) {
    value += src_s_offset[i] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}



template <typename IMAGE_T, const int offset>
__global__ void convolution_y_kernel_s(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  __shared__ IMAGE_T src_s[YCONV_BLOCKDIM_Z * (YCONV_BLOCKDIM_Y + 2*offset) * YCONV_BLOCKDIM_X];
  const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || z >= depth) {
    return;
  }

  // global idx
  const long idx = z * long(height) * long(width) + y * long(width) + x;

  // idx of first entry in column (y = 0)
  const long g_col_base_idx = z * long(height) * long(width) + x;

  // idx of first entry in shared mem array column
  const int t_col_base_idx = threadIdx.z * (YCONV_BLOCKDIM_Y + offset*2) * YCONV_BLOCKDIM_X
                             + threadIdx.x + offset * YCONV_BLOCKDIM_X;

  // idx of first y coord in this block
  const int base_y = y - threadIdx.y;

  // Check if we need to deal with edge cases
  if (base_y - offset < 0 || base_y + YCONV_BLOCKDIM_Y - 1 + offset >= height) {
    for (int i = threadIdx.y - offset; i < YCONV_BLOCKDIM_Y + offset; i += YCONV_BLOCKDIM_Y) {
      int y_offset = base_y + i;
      if (y_offset < 0) // top edge case
        y_offset = 0;
      else if (y_offset >= height) // bottom edge case
        y_offset = height-1;
      src_s[t_col_base_idx + i*YCONV_BLOCKDIM_X] = src_d[g_col_base_idx + y_offset*width];
    }
  } else {
    for (int i = threadIdx.y - offset; i < YCONV_BLOCKDIM_Y + offset; i += YCONV_BLOCKDIM_Y) {
      int y_offset = base_y + i;
      src_s[t_col_base_idx + i*YCONV_BLOCKDIM_X] = src_d[g_col_base_idx + y_offset*width];
    }
  }
  if (y >= height)
    return;

  // XXX is this safe to do with early returns on all GPUs?
  __syncthreads();

  const IMAGE_T* src_s_offset = src_s + t_col_base_idx + threadIdx.y * YCONV_BLOCKDIM_X;
  const float* kernel_offset = kernel_d_c + offset;
  float value = 0.0f;
#pragma unroll
  for (int i = -offset; i <= offset; ++i) {
    value += src_s_offset[i*YCONV_BLOCKDIM_X] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


template <typename IMAGE_T, const int offset>
__global__ void convolution_z_kernel_s(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  __shared__ IMAGE_T src_s[ZCONV_BLOCKDIM_Z * (ZCONV_BLOCKDIM_Y + 2*offset) * ZCONV_BLOCKDIM_X];
  const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height) {
    return;
  }

  // global idx
  const long idx = z * long(height) * long(width) + y * long(width) + x;

  // idx of first entry in column (z = 0)
  const long g_base_col_idx = y * long(width) + x;

  // idx of first entry in shared mem array column
  const int t_base_col_idx = threadIdx.y * ZCONV_BLOCKDIM_X
                             + threadIdx.x + offset * ZCONV_BLOCKDIM_X;
  const int base_z = z - threadIdx.z;
  const long heightWidth = long(height) * long(width);
  float value = 0.0f;

  // Check if we need to deal with edge cases
  if (base_z - offset < 0 || base_z + ZCONV_BLOCKDIM_Z - 1 + offset >= depth) {
    for (int i = threadIdx.z - offset; i < ZCONV_BLOCKDIM_Z + offset; i += ZCONV_BLOCKDIM_Z) {
      int z_offset = base_z + i;
      if (z_offset < 0) // front edge case
        z_offset = 0;
      else if (z_offset >= depth) // back edge case
        z_offset = depth-1;
      src_s[t_base_col_idx + i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y]
        = src_d[g_base_col_idx + z_offset*heightWidth];
    }
  } else {
    for (int i = threadIdx.z - offset; i < ZCONV_BLOCKDIM_Z + offset; i += ZCONV_BLOCKDIM_Z) {
      int z_offset = base_z + i;
      src_s[t_base_col_idx + i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y]
        = src_d[g_base_col_idx + z_offset*heightWidth];
    }
  }

  if (z >= depth)
    return;

  __syncthreads();

  const IMAGE_T* src_s_offset = src_s + t_base_col_idx + threadIdx.z
                                * ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y;
  const float* kernel_offset = kernel_d_c + offset;
#pragma unroll
  for (int i = -offset; i <= offset; ++i) {
    value += src_s_offset[i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}



template <typename IMAGE_T, const int offset>
__global__ void convolution_x_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = x - offset >= 0 ? -offset : -x;
  const int offset_pos = x + offset < width ? offset : width - x - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  if (offset_neg == -offset && offset_pos == offset) {
#pragma unroll
    for (int i = -offset; i <= offset; ++i) {
      value += src_idx[i] * kernel_offset[i];
    }
  } else {
    // Handle boundary condition
    for (int i = -offset; i < offset_neg; ++i) {
      value += src_idx[offset_neg] * kernel_offset[i];
    }

    for (int i = offset_neg; i < offset_pos; ++i) {
      value += src_idx[i] * kernel_offset[i];
    }

    // Handle boundary condition
    for (int i = offset_pos; i <= offset; ++i) {
      value += src_idx[offset_pos] * kernel_offset[i];
    }
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


template <typename IMAGE_T>
__global__ void convolution_x_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int offset, int width, int height, int depth) {
  const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = x - offset >= 0 ? -offset : -x;
  const int offset_pos = x + offset < width ? offset : width - x - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  // Handle boundary condition
  for (int i = -offset; i < offset_neg; ++i) {
    value += src_idx[offset_neg] * kernel_offset[i];
  }

  for (int i = offset_neg; i < offset_pos; ++i) {
    value += src_idx[i] * kernel_offset[i];
  }

  // Handle boundary condition
  for (int i = offset_pos; i <= offset; ++i) {
    value += src_idx[offset_pos] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}



template <typename IMAGE_T, const int offset>
__global__ void convolution_y_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = y - offset >= 0 ? -offset : -y;
  const int offset_pos = y + offset < height ? offset : height - y - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  if (offset_neg == -offset && offset_pos == offset) {
#pragma unroll
    for (int i = -offset; i <= offset; ++i) {
      value += src_idx[i*width] * kernel_offset[i];
    }
  } else {
    // Handle boundary condition
    for (int i = -offset; i < offset_neg; ++i) {
      value += src_idx[offset_neg*width] * kernel_offset[i];
    }

    for (int i = offset_neg; i < offset_pos; ++i) {
      value += src_idx[i*width] * kernel_offset[i];
    }

    // Handle boundary condition
    for (int i = offset_pos; i <= offset; ++i) {
      value += src_idx[offset_pos*width] * kernel_offset[i];
    }
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


template <typename IMAGE_T>
__global__ void convolution_y_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int offset, int width, int height, int depth) {
  const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = y - offset >= 0 ? -offset : -y;
  const int offset_pos = y + offset < height ? offset : height - y - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  // Handle boundary condition
  for (int i = -offset; i < offset_neg; ++i) {
    value += src_idx[offset_neg*width] * kernel_offset[i];
  }

  for (int i = offset_neg; i < offset_pos; ++i) {
    value += src_idx[i*width] * kernel_offset[i];
  }

  // Handle boundary condition
  for (int i = offset_pos; i <= offset; ++i) {
    value += src_idx[offset_pos*width] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}



template <typename IMAGE_T>
__global__ void convolution_z_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int offset, int width, int height, int depth) {
  const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  // use a long index so very large volumes don't overflow 32-bit arithmetic
  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = z - offset >= 0 ? -offset : -z;
  const int offset_pos = z + offset < depth ? offset : depth - z - 1;
  const long heightWidth = long(height) * long(width);
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  // Handle boundary condition
  for (int i = -offset; i < offset_neg; ++i) {
    value += src_idx[offset_neg*heightWidth] * kernel_offset[i];
  }

  for (int i = offset_neg; i < offset_pos; ++i) {
    value += src_idx[i*heightWidth] * kernel_offset[i];
  }

  // Handle boundary condition
  for (int i = offset_pos; i <= offset; ++i) {
    value += src_idx[offset_pos*heightWidth] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


template <typename IMAGE_T, const int offset>
__global__ void convolution_z_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = z - offset >= 0 ? -offset : -z;
  const int offset_pos = z + offset < depth ? offset : depth - z - 1;
  const long heightWidth = long(height) * long(width);
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  if (offset_neg == -offset && offset_pos == offset) {
#pragma unroll
    for (int i = -offset; i <= offset; ++i) {
      value += src_idx[i*heightWidth] * kernel_offset[i];
    }
  } else {
    // Handle boundary condition
    for (int i = -offset; i < offset_neg; ++i) {
      value += src_idx[offset_neg*heightWidth] * kernel_offset[i];
    }

    for (int i = offset_neg; i < offset_pos; ++i) {
      value += src_idx[i*heightWidth] * kernel_offset[i];
    }

    // Handle boundary condition
    for (int i = offset_pos; i <= offset; ++i) {
      value += src_idx[offset_pos*heightWidth] * kernel_offset[i];
    }
  }

  convert_type_and_assign_val(dst_d, idx, value);
}



template <typename IMAGE_T>
void gaussian1D_x_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
                       int width, int height, int depth) {
  long x_grid = (width + XCONV_BLOCKDIM_X - 1)/XCONV_BLOCKDIM_X;
  long y_grid = (height + XCONV_BLOCKDIM_Y - 1)/XCONV_BLOCKDIM_Y;
  long z_grid = (depth + XCONV_BLOCKDIM_Z - 1)/XCONV_BLOCKDIM_Z;
  dim3 block(XCONV_BLOCKDIM_X, XCONV_BLOCKDIM_Y, XCONV_BLOCKDIM_Z);
  dim3 grid(x_grid, y_grid, z_grid);

  // XXX TODO
  // This comment applies to all gaussian1D_XX_cuda kernels.
  // We should probably pick a few sizes that are farther apart, like
  // 7, 15, 25, 35, 47, 59, etc., and then for each blur round the kernel
  // size up to the nearest supported size.  Since we are ~doubling the
  // sigma after each round of blurring, the current scheme is inefficient:
  // only a few of the cases below are used before falling through to the
  // default case.
  // If we build the kernel using the same sigma but a larger kernel_size,
  // it should generate the same (or slightly more accurate) results,
  // and the perf gained from using a templated kernel is much greater
  // than the hit from using a slightly larger than needed kernel_size.

  switch (kernel_size) {
    case 3:
      convolution_x_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 5:
      convolution_x_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 7:
      convolution_x_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 9:
      convolution_x_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 11:
      convolution_x_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 13:
      convolution_x_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 15:
      convolution_x_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 17:
      convolution_x_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 19:
      convolution_x_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 21:
      convolution_x_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    default:
      convolution_x_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
          kernel_size/2, width, height, depth);
      break;
  }

  CUERR // check and clear any existing errors
}
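
// The XXX TODO above proposes rounding the requested kernel size up to the
// nearest size that has a templated instantiation.  The helper below sketches
// that idea for illustration only; round_up_kernel_size_example and its size
// table are hypothetical and are not used anywhere in this file.
#if 0
static int round_up_kernel_size_example(int kernel_size) {
  // candidate sizes spaced farther apart than every odd value, trading a
  // slightly wider kernel for staying on the templated fast path
  static const int supported[] = { 7, 15, 25, 35, 47, 59 };
  for (int i = 0; i < int(sizeof(supported)/sizeof(supported[0])); ++i) {
    if (kernel_size <= supported[i])
      return supported[i];
  }
  return kernel_size;  // too large; fall through to the runtime-offset kernel
}
#endif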


template <typename IMAGE_T>
void gaussian1D_y_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
                       int width, int height, int depth) {
  long x_grid = (width + YCONV_BLOCKDIM_X - 1)/YCONV_BLOCKDIM_X;
  long y_grid = (height + YCONV_BLOCKDIM_Y - 1)/YCONV_BLOCKDIM_Y;
  long z_grid = (depth + YCONV_BLOCKDIM_Z - 1)/YCONV_BLOCKDIM_Z;
  dim3 block(YCONV_BLOCKDIM_X, YCONV_BLOCKDIM_Y, YCONV_BLOCKDIM_Z);
  dim3 grid(x_grid, y_grid, z_grid);

  switch (kernel_size) {
    case 3:
      convolution_y_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 5:
      convolution_y_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 7:
      convolution_y_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 9:
      convolution_y_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 11:
      convolution_y_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 13:
      convolution_y_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 15:
      convolution_y_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 17:
      convolution_y_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 19:
      convolution_y_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 21:
      convolution_y_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    default:
      convolution_y_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
          kernel_size/2, width, height, depth);
      break;
  }

  CUERR // check and clear any existing errors
}


template <typename IMAGE_T>
void gaussian1D_z_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
                       int width, int height, int depth) {
  long x_grid = (width + ZCONV_BLOCKDIM_X - 1)/ZCONV_BLOCKDIM_X;
  long y_grid = (height + ZCONV_BLOCKDIM_Y - 1)/ZCONV_BLOCKDIM_Y;
  long z_grid = (depth + ZCONV_BLOCKDIM_Z - 1)/ZCONV_BLOCKDIM_Z;
  dim3 block(ZCONV_BLOCKDIM_X, ZCONV_BLOCKDIM_Y, ZCONV_BLOCKDIM_Z);
  dim3 grid(x_grid, y_grid, z_grid);

  switch (kernel_size) {
    case 3:
      convolution_z_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 5:
      convolution_z_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 7:
      convolution_z_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 9:
      convolution_z_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 11:
      convolution_z_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 13:
      convolution_z_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 15:
      convolution_z_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 17:
      convolution_z_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 19:
      convolution_z_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 21:
      convolution_z_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    default:
      convolution_z_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
          kernel_size/2, width, height, depth);
      break;
  }

  CUERR // check and clear any existing errors
}
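
// For reference, a full separable 3-D blur is an x pass, a y pass, and a z pass,
// ping-ponging between two device buffers (the SWAPF macro near the top of this
// file performs that kind of pointer swap for float buffers).  The sketch below
// is purely illustrative: blur_3d_example, its buffer names, and the scratch
// buffer management are assumptions, not part of this file; the real driver
// lives in the host-side Gaussian blur code outside this file.
#if 0
template <typename IMAGE_T>
void blur_3d_example(IMAGE_T* image_d, IMAGE_T* scratch_d, int kernel_size,
                     int width, int height, int depth) {
  // kernel_d_c must already hold the 1-D weights (see set_gaussian_1D_kernel_cuda)
  gaussian1D_x_cuda<IMAGE_T>(image_d, scratch_d, kernel_size, width, height, depth);
  gaussian1D_y_cuda<IMAGE_T>(scratch_d, image_d, kernel_size, width, height, depth);
  gaussian1D_z_cuda<IMAGE_T>(image_d, scratch_d, kernel_size, width, height, depth);
  // after the z pass the blurred volume is in scratch_d; copy or swap as needed
}
#endif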