/***************************************************************************
 *cr
 *cr            (C) Copyright 1995-2019 The Board of Trustees of the
 *cr                        University of Illinois
 *cr                         All Rights Reserved
 *cr
 ***************************************************************************/
/***************************************************************************
 * RCS INFORMATION:
 *
 *      $RCSfile: CUDAGaussianBlur.cu,v $
 *      $Author: johns $        $Locker:  $             $State: Exp $
 *      $Revision: 1.18 $       $Date: 2020/02/26 20:31:15 $
 *
 ***************************************************************************/

#define WATERSHED_INTERNAL 1

#include <stdio.h>
#include "CUDAGaussianBlur.h"
#include "Watershed.h"
#include "ProfileHooks.h"
#include <cuda_fp16.h>

#if 0
#define CUERR { cudaError_t err; \
  cudaDeviceSynchronize(); \
  if ((err = cudaGetLastError()) != cudaSuccess) { \
  printf("CUDA error: %s, %s line %d\n", cudaGetErrorString(err), __FILE__, __LINE__); \
  }}
#else
#define CUERR
#endif

#define KERNEL_MAX_SIZE 4096

__constant__ float kernel_d_c[KERNEL_MAX_SIZE];

#define BLOCKDIM_X 128
#define BLOCKDIM_Y 1
#define BLOCKDIM_Z 1

// optimized for kernel size 21
#define XCONV_BLOCKDIM_X 96
#define XCONV_BLOCKDIM_Y 1
#define XCONV_BLOCKDIM_Z 1

#define YCONV_BLOCKDIM_X 16
#define YCONV_BLOCKDIM_Y 8
#define YCONV_BLOCKDIM_Z 1

#define ZCONV_BLOCKDIM_X 4
#define ZCONV_BLOCKDIM_Y 1
#define ZCONV_BLOCKDIM_Z 32

#define X_PADDING 32

#define SWAPF(a, b) {\
  float* t = a;\
  a = b;\
  b = t;\
}

#define INST_GAUSSIAN_CUDA(I_T) \
  template void gaussian1D_x_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
      int width, int height, int depth);\
  template void gaussian1D_y_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
      int width, int height, int depth);\
  template void gaussian1D_z_cuda<I_T>(I_T* src_d, I_T* dst_d, int kernel_size,\
      int width, int height, int depth);

INST_GAUSSIAN_CUDA(float)
//INST_GAUSSIAN_CUDA(half)
INST_GAUSSIAN_CUDA(unsigned short)
INST_GAUSSIAN_CUDA(unsigned char)

// Function that converts val to the appropriate type and then assigns it to arr[idx].
// This is necessary for converting a real type to an integer type so we can handle rounding.
template <typename T>
__device__ __forceinline__ void convert_type_and_assign_val(T* arr, long idx, float val) {
  arr[idx] = roundf(val);
}

template <> __device__ __forceinline__ void convert_type_and_assign_val<float>(float* arr, long idx, float val) {
  arr[idx] = val;
}

#if CUDART_VERSION >= 9000
// half-precision only became mainstream with CUDA 9.x and later versions
template <> __device__ __forceinline__ void convert_type_and_assign_val<half>(half* arr, long idx, float val) {
  arr[idx] = val;
}
#endif

void copy_array_from_gpu(void* arr, void* arr_d, int bytes) {
  PROFILE_PUSH_RANGE("CUDAGaussianBlur::copy_array_from_gpu()", 5);

  cudaMemcpy(arr, arr_d, bytes, cudaMemcpyDeviceToHost);
  CUERR // check and clear any existing errors

  PROFILE_POP_RANGE();
}


void copy_array_to_gpu(void* arr_d, void* arr, int bytes) {
  cudaMemcpyAsync(arr_d, arr, bytes, cudaMemcpyHostToDevice);
  CUERR // check and clear any existing errors
}


void* alloc_cuda_array(int bytes) {
  void* t;
  cudaMalloc(&t, bytes);
  CUERR // check and clear any existing errors
  return t;
}


void free_cuda_array(void* arr) {
  cudaFree(arr);
}


void set_gaussian_1D_kernel_cuda(float* kernel, int kernel_size) {

  if (kernel_size > KERNEL_MAX_SIZE) {
    printf("Warning: exceeded maximum kernel size on GPU.\n");
    kernel_size = KERNEL_MAX_SIZE;
  }

  cudaMemcpyToSymbolAsync(kernel_d_c, kernel, kernel_size * sizeof(float));
  CUERR // check and clear any existing errors
}
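
// The constant-memory array kernel_d_c is expected to hold normalized 1-D
// Gaussian weights.  As an illustration only (not part of this file's API),
// a host-side helper along the following lines could generate and upload
// those weights; the name make_gaussian_kernel_example and its calling code
// are hypothetical, and expf() would require <math.h>.
#if 0
static void make_gaussian_kernel_example(float sigma, int kernel_size) {
  // kernel_size is assumed odd and <= KERNEL_MAX_SIZE
  float* kernel = new float[kernel_size];
  int offset = kernel_size / 2;
  float sum = 0.0f;
  for (int i = 0; i < kernel_size; ++i) {
    float x = float(i - offset);
    kernel[i] = expf(-x*x / (2.0f * sigma * sigma));
    sum += kernel[i];
  }
  for (int i = 0; i < kernel_size; ++i) {
    kernel[i] /= sum;   // normalize so the weights sum to 1
  }
  set_gaussian_1D_kernel_cuda(kernel, kernel_size);
  delete [] kernel;
}
#endif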


template <typename IMAGE_T, const int offset>
__global__ void convolution_x_kernel_s(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  __shared__ IMAGE_T src_s[XCONV_BLOCKDIM_Z * XCONV_BLOCKDIM_Y * (XCONV_BLOCKDIM_X + offset * 2)];
  const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;

  if (y >= height || z >= depth) {
    return;
  }

  // global idx
  const long idx = z * long(height) * long(width) + y * long(width) + x;

  // idx of first entry in row (x = 0)
  const long g_row_base_idx = z * long(height) * long(width) + y * long(width);

  // idx of first entry in shared mem array row
  const int t_row_base_idx = threadIdx.z * XCONV_BLOCKDIM_Y * (XCONV_BLOCKDIM_X + offset * 2)
                             + threadIdx.y * (XCONV_BLOCKDIM_X + offset*2) + offset;

  // idx of first x coord in this block
  const int base_x = x - threadIdx.x;

  // Check if we need to deal with edge cases
  if (base_x - offset < 0 || base_x + XCONV_BLOCKDIM_X + offset >= width) {
    for (int i = threadIdx.x - offset; i < XCONV_BLOCKDIM_X + offset; i += XCONV_BLOCKDIM_X) {
      int x_offset = base_x + i;
      if (x_offset < 0) // left edge case
        x_offset = 0;
      else if (x_offset >= width) // right edge case
        x_offset = width-1;
      src_s[t_row_base_idx + i] = src_d[g_row_base_idx + x_offset];
    }
  } else {
    for (int i = threadIdx.x - offset; i < XCONV_BLOCKDIM_X + offset; i += XCONV_BLOCKDIM_X) {
      int x_offset = base_x + i;
      src_s[t_row_base_idx + i] = src_d[g_row_base_idx + x_offset];
    }
  }
  if (x >= width)
    return;

  // XXX is this safe to do with early returns on all GPUs?
  __syncthreads();

  const IMAGE_T* src_s_offset = src_s + t_row_base_idx + threadIdx.x;
  const float* kernel_offset = kernel_d_c + offset;
  float value = 0.0f;
#pragma unroll
  for (int i = -offset; i <= offset; ++i) {
    value += src_s_offset[i] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}



template <typename IMAGE_T, const int offset>
__global__ void convolution_y_kernel_s(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  __shared__ IMAGE_T src_s[YCONV_BLOCKDIM_Z * (YCONV_BLOCKDIM_Y + 2*offset) * YCONV_BLOCKDIM_X];
  const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || z >= depth) {
    return;
  }

  // global idx
  const long idx = z * long(height) * long(width) + y * long(width) + x;

  // idx of first entry in column (y = 0)
  const long g_col_base_idx = z * long(height) * long(width) + x;

  // idx of first entry in shared mem array column
  const int t_col_base_idx = threadIdx.z * (YCONV_BLOCKDIM_Y + offset*2) * YCONV_BLOCKDIM_X
                             + threadIdx.x + offset * YCONV_BLOCKDIM_X;

  // idx of first y coord in this block
  const int base_y = y - threadIdx.y;

  // Check if we need to deal with edge cases
  if (base_y - offset < 0 || base_y + YCONV_BLOCKDIM_Y - 1 + offset >= height) {
    for (int i = threadIdx.y - offset; i < YCONV_BLOCKDIM_Y + offset; i += YCONV_BLOCKDIM_Y) {
      int y_offset = base_y + i;
      if (y_offset < 0) // top edge case
        y_offset = 0;
      else if (y_offset >= height) // bottom edge case
        y_offset = height-1;
      src_s[t_col_base_idx + i*YCONV_BLOCKDIM_X] = src_d[g_col_base_idx + y_offset*width];
    }
  } else {
    for (int i = threadIdx.y - offset; i < YCONV_BLOCKDIM_Y + offset; i += YCONV_BLOCKDIM_Y) {
      int y_offset = base_y + i;
      src_s[t_col_base_idx + i*YCONV_BLOCKDIM_X] = src_d[g_col_base_idx + y_offset*width];
    }
  }
  if (y >= height)
    return;

  // XXX is this safe to do with early returns on all GPUs?
  __syncthreads();

  const IMAGE_T* src_s_offset = src_s + t_col_base_idx + threadIdx.y * YCONV_BLOCKDIM_X;
  const float* kernel_offset = kernel_d_c + offset;
  float value = 0.0f;
#pragma unroll
  for (int i = -offset; i <= offset; ++i) {
    value += src_s_offset[i*YCONV_BLOCKDIM_X] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


template <typename IMAGE_T, const int offset>
__global__ void convolution_z_kernel_s(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  __shared__ IMAGE_T src_s[ZCONV_BLOCKDIM_Z * (ZCONV_BLOCKDIM_Y + 2*offset) * ZCONV_BLOCKDIM_X];
  const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height) {
    return;
  }

  // global idx
  const long idx = z * long(height) * long(width) + y * long(width) + x;

  // idx of first entry in column (z = 0)
  const long g_base_col_idx = y * long(width) + x;

  // idx of first entry in shared mem array column
  const int t_base_col_idx = threadIdx.y * ZCONV_BLOCKDIM_X
                             + threadIdx.x + offset * ZCONV_BLOCKDIM_X;
  const int base_z = z - threadIdx.z;
  const long heightWidth = long(height) * long(width);
  float value = 0.0f;

  // Check if we need to deal with edge cases
  if (base_z - offset < 0 || base_z + ZCONV_BLOCKDIM_Z - 1 + offset >= depth) {
    for (int i = threadIdx.z - offset; i < ZCONV_BLOCKDIM_Z + offset; i += ZCONV_BLOCKDIM_Z) {
      int z_offset = base_z + i;
      if (z_offset < 0) // front edge case
        z_offset = 0;
      else if (z_offset >= depth) // back edge case
        z_offset = depth-1;
      src_s[t_base_col_idx + i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y]
        = src_d[g_base_col_idx + z_offset*heightWidth];
    }
  } else {
    for (int i = threadIdx.z - offset; i < ZCONV_BLOCKDIM_Z + offset; i += ZCONV_BLOCKDIM_Z) {
      int z_offset = base_z + i;
      src_s[t_base_col_idx + i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y]
        = src_d[g_base_col_idx + z_offset*heightWidth];
    }
  }

  if (z >= depth)
    return;

  __syncthreads();

  const IMAGE_T* src_s_offset = src_s + t_base_col_idx + threadIdx.z
                                * ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y;
  const float* kernel_offset = kernel_d_c + offset;
#pragma unroll
  for (int i = -offset; i <= offset; ++i) {
    value += src_s_offset[i*ZCONV_BLOCKDIM_X*ZCONV_BLOCKDIM_Y] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}



template <typename IMAGE_T, const int offset>
__global__ void convolution_x_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = x - offset >= 0 ? -offset : -x;
  const int offset_pos = x + offset < width ? offset : width - x - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  if (offset_neg == -offset && offset_pos == offset) {
#pragma unroll
    for (int i = -offset; i <= offset; ++i) {
      value += src_idx[i] * kernel_offset[i];
    }
  } else {
    // Handle boundary condition
    for (int i = -offset; i < offset_neg; ++i) {
      value += src_idx[offset_neg] * kernel_offset[i];
    }

    for (int i = offset_neg; i < offset_pos; ++i) {
      value += src_idx[i] * kernel_offset[i];
    }

    // Handle boundary condition
    for (int i = offset_pos; i <= offset; ++i) {
      value += src_idx[offset_pos] * kernel_offset[i];
    }
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


template <typename IMAGE_T>
__global__ void convolution_x_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int offset, int width, int height, int depth) {
  const int x = blockIdx.x * XCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * XCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * XCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = x - offset >= 0 ? -offset : -x;
  const int offset_pos = x + offset < width ? offset : width - x - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  // Handle boundary condition
  for (int i = -offset; i < offset_neg; ++i) {
    value += src_idx[offset_neg] * kernel_offset[i];
  }

  for (int i = offset_neg; i < offset_pos; ++i) {
    value += src_idx[i] * kernel_offset[i];
  }

  // Handle boundary condition
  for (int i = offset_pos; i <= offset; ++i) {
    value += src_idx[offset_pos] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}



template <typename IMAGE_T, const int offset>
__global__ void convolution_y_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = y - offset >= 0 ? -offset : -y;
  const int offset_pos = y + offset < height ? offset : height - y - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  if (offset_neg == -offset && offset_pos == offset) {
#pragma unroll
    for (int i = -offset; i <= offset; ++i) {
      value += src_idx[i*width] * kernel_offset[i];
    }
  } else {
    // Handle boundary condition
    for (int i = -offset; i < offset_neg; ++i) {
      value += src_idx[offset_neg*width] * kernel_offset[i];
    }

    for (int i = offset_neg; i < offset_pos; ++i) {
      value += src_idx[i*width] * kernel_offset[i];
    }

    // Handle boundary condition
    for (int i = offset_pos; i <= offset; ++i) {
      value += src_idx[offset_pos*width] * kernel_offset[i];
    }
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


template <typename IMAGE_T>
__global__ void convolution_y_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int offset, int width, int height, int depth) {
  const int x = blockIdx.x * YCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * YCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * YCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = y - offset >= 0 ? -offset : -y;
  const int offset_pos = y + offset < height ? offset : height - y - 1;
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  // Handle boundary condition
  for (int i = -offset; i < offset_neg; ++i) {
    value += src_idx[offset_neg*width] * kernel_offset[i];
  }

  for (int i = offset_neg; i < offset_pos; ++i) {
    value += src_idx[i*width] * kernel_offset[i];
  }

  // Handle boundary condition
  for (int i = offset_pos; i <= offset; ++i) {
    value += src_idx[offset_pos*width] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}



template <typename IMAGE_T>
__global__ void convolution_z_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int offset, int width, int height, int depth) {
  const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  // use a long index so very large volumes don't overflow 32-bit arithmetic
  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = z - offset >= 0 ? -offset : -z;
  const int offset_pos = z + offset < depth ? offset : depth - z - 1;
  const long heightWidth = long(height) * long(width);
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  // Handle boundary condition
  for (int i = -offset; i < offset_neg; ++i) {
    value += src_idx[offset_neg*heightWidth] * kernel_offset[i];
  }

  for (int i = offset_neg; i < offset_pos; ++i) {
    value += src_idx[i*heightWidth] * kernel_offset[i];
  }

  // Handle boundary condition
  for (int i = offset_pos; i <= offset; ++i) {
    value += src_idx[offset_pos*heightWidth] * kernel_offset[i];
  }

  convert_type_and_assign_val(dst_d, idx, value);
}


template <typename IMAGE_T, const int offset>
__global__ void convolution_z_kernel(const IMAGE_T* __restrict__ src_d,
    IMAGE_T* __restrict__ dst_d, int width, int height, int depth) {
  const int x = blockIdx.x * ZCONV_BLOCKDIM_X + threadIdx.x;
  const int y = blockIdx.y * ZCONV_BLOCKDIM_Y + threadIdx.y;
  const int z = blockIdx.z * ZCONV_BLOCKDIM_Z + threadIdx.z;

  if (x >= width || y >= height || z >= depth) {
    return;
  }

  const long idx = z * long(height) * long(width) + y * long(width) + x;
  const int offset_neg = z - offset >= 0 ? -offset : -z;
  const int offset_pos = z + offset < depth ? offset : depth - z - 1;
  const long heightWidth = long(height) * long(width);
  const float* kernel_offset = kernel_d_c + offset;
  const IMAGE_T* src_idx = src_d + idx;
  float value = 0.0f;

  if (offset_neg == -offset && offset_pos == offset) {
#pragma unroll
    for (int i = -offset; i <= offset; ++i) {
      value += src_idx[i*heightWidth] * kernel_offset[i];
    }
  } else {
    // Handle boundary condition
    for (int i = -offset; i < offset_neg; ++i) {
      value += src_idx[offset_neg*heightWidth] * kernel_offset[i];
    }

    for (int i = offset_neg; i < offset_pos; ++i) {
      value += src_idx[i*heightWidth] * kernel_offset[i];
    }

    // Handle boundary condition
    for (int i = offset_pos; i <= offset; ++i) {
      value += src_idx[offset_pos*heightWidth] * kernel_offset[i];
    }
  }

  convert_type_and_assign_val(dst_d, idx, value);
}



template <typename IMAGE_T>
void gaussian1D_x_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
                       int width, int height, int depth) {
  long x_grid = (width + XCONV_BLOCKDIM_X - 1)/XCONV_BLOCKDIM_X;
  long y_grid = (height + XCONV_BLOCKDIM_Y - 1)/XCONV_BLOCKDIM_Y;
  long z_grid = (depth + XCONV_BLOCKDIM_Z - 1)/XCONV_BLOCKDIM_Z;
  dim3 block(XCONV_BLOCKDIM_X, XCONV_BLOCKDIM_Y, XCONV_BLOCKDIM_Z);
  dim3 grid(x_grid, y_grid, z_grid);

  // XXX TODO
  // This comment applies to all gaussian1D_XX_cuda kernels.
  // We should probably pick a few sizes that are farther apart, like
  // 7, 15, 25, 35, 47, 59, etc., and then for each blur round the kernel
  // size up to the nearest supported size.  Since we are ~doubling the
  // sigma after each round of blurring, the current scheme is inefficient:
  // only a few of the cases below are used before falling through to the
  // default case.
  // If we build the kernel using the same sigma but a larger kernel_size,
  // it should generate the same (or slightly more accurate) results,
  // and the perf gained from using a templated kernel is much greater
  // than the hit from using a slightly larger than needed kernel_size.

  switch (kernel_size) {
    case 3:
      convolution_x_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 5:
      convolution_x_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 7:
      convolution_x_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 9:
      convolution_x_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 11:
      convolution_x_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 13:
      convolution_x_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 15:
      convolution_x_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 17:
      convolution_x_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 19:
      convolution_x_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 21:
      convolution_x_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    default:
      convolution_x_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
          kernel_size/2, width, height, depth);
      break;
  }

  CUERR // check and clear any existing errors
}
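
// The XXX TODO above proposes rounding the requested kernel size up to the
// nearest size that has a templated instantiation.  The helper below sketches
// that idea for illustration only; round_up_kernel_size_example and its size
// table are hypothetical and are not used anywhere in this file.
#if 0
static int round_up_kernel_size_example(int kernel_size) {
  // candidate sizes spaced farther apart than every odd value, trading a
  // slightly wider kernel for staying on the templated fast path
  static const int supported[] = { 7, 15, 25, 35, 47, 59 };
  for (int i = 0; i < int(sizeof(supported)/sizeof(supported[0])); ++i) {
    if (kernel_size <= supported[i])
      return supported[i];
  }
  return kernel_size;  // too large; fall through to the runtime-offset kernel
}
#endif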


template <typename IMAGE_T>
void gaussian1D_y_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
                       int width, int height, int depth) {
  long x_grid = (width + YCONV_BLOCKDIM_X - 1)/YCONV_BLOCKDIM_X;
  long y_grid = (height + YCONV_BLOCKDIM_Y - 1)/YCONV_BLOCKDIM_Y;
  long z_grid = (depth + YCONV_BLOCKDIM_Z - 1)/YCONV_BLOCKDIM_Z;
  dim3 block(YCONV_BLOCKDIM_X, YCONV_BLOCKDIM_Y, YCONV_BLOCKDIM_Z);
  dim3 grid(x_grid, y_grid, z_grid);

  switch (kernel_size) {
    case 3:
      convolution_y_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 5:
      convolution_y_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 7:
      convolution_y_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 9:
      convolution_y_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 11:
      convolution_y_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 13:
      convolution_y_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 15:
      convolution_y_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 17:
      convolution_y_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 19:
      convolution_y_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 21:
      convolution_y_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    default:
      convolution_y_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
          kernel_size/2, width, height, depth);
      break;
  }

  CUERR // check and clear any existing errors
}


template <typename IMAGE_T>
void gaussian1D_z_cuda(IMAGE_T* src_d, IMAGE_T* dst_d, int kernel_size,
                       int width, int height, int depth) {
  long x_grid = (width + ZCONV_BLOCKDIM_X - 1)/ZCONV_BLOCKDIM_X;
  long y_grid = (height + ZCONV_BLOCKDIM_Y - 1)/ZCONV_BLOCKDIM_Y;
  long z_grid = (depth + ZCONV_BLOCKDIM_Z - 1)/ZCONV_BLOCKDIM_Z;
  dim3 block(ZCONV_BLOCKDIM_X, ZCONV_BLOCKDIM_Y, ZCONV_BLOCKDIM_Z);
  dim3 grid(x_grid, y_grid, z_grid);

  switch (kernel_size) {
    case 3:
      convolution_z_kernel_s<IMAGE_T, 3/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 5:
      convolution_z_kernel_s<IMAGE_T, 5/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 7:
      convolution_z_kernel_s<IMAGE_T, 7/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 9:
      convolution_z_kernel_s<IMAGE_T, 9/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 11:
      convolution_z_kernel_s<IMAGE_T, 11/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 13:
      convolution_z_kernel_s<IMAGE_T, 13/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 15:
      convolution_z_kernel_s<IMAGE_T, 15/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 17:
      convolution_z_kernel_s<IMAGE_T, 17/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 19:
      convolution_z_kernel_s<IMAGE_T, 19/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    case 21:
      convolution_z_kernel_s<IMAGE_T, 21/2><<<grid, block>>>(src_d, dst_d,
          width, height, depth);
      break;
    default:
      convolution_z_kernel<IMAGE_T><<<grid, block>>>(src_d, dst_d,
          kernel_size/2, width, height, depth);
      break;
  }

  CUERR // check and clear any existing errors
}
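
// For reference, a full separable 3-D blur is an x pass, a y pass, and a z pass,
// ping-ponging between two device buffers (the SWAPF macro near the top of this
// file performs that kind of pointer swap for float buffers).  The sketch below
// is purely illustrative: blur_3d_example, its buffer names, and the scratch
// buffer management are assumptions, not part of this file; the real driver
// lives in the host-side Gaussian blur code outside this file.
#if 0
template <typename IMAGE_T>
void blur_3d_example(IMAGE_T* image_d, IMAGE_T* scratch_d, int kernel_size,
                     int width, int height, int depth) {
  // kernel_d_c must already hold the 1-D weights (see set_gaussian_1D_kernel_cuda)
  gaussian1D_x_cuda<IMAGE_T>(image_d, scratch_d, kernel_size, width, height, depth);
  gaussian1D_y_cuda<IMAGE_T>(scratch_d, image_d, kernel_size, width, height, depth);
  gaussian1D_z_cuda<IMAGE_T>(image_d, scratch_d, kernel_size, width, height, depth);
  // after the z pass the blurred volume is in scratch_d; copy or swap as needed
}
#endif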