1 /*
2 * This file is part of FFmpeg.
3 *
4 * FFmpeg is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public
6 * License as published by the Free Software Foundation; either
7 * version 2.1 of the License, or (at your option) any later version.
8 *
9 * FFmpeg is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with FFmpeg; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17 */
19
26
32
33 // TODO:
34 // the integral image may overflow 32bit, consider using 64bit
35
40 };
41
43 {
45
48 return 1;
49 return 0;
50 }
51
62 cl_mem
overflow;
// overflow in integral image?
73
75 {
77 cl_int cle;
78 int err;
80
82 if (!(
ctx->research_size & 1)) {
83 ctx->research_size |= 1;
85 "research_size should be odd, set to %d",
87 }
88
89 if (!(
ctx->patch_size & 1)) {
92 "patch_size should be odd, set to %d",
94 }
95
96 if (!
ctx->research_size_uv)
97 ctx->research_size_uv =
ctx->research_size;
98 if (!
ctx->patch_size_uv)
99 ctx->patch_size_uv =
ctx->patch_size;
100
102 if (err < 0)
104
105 ctx->command_queue = clCreateCommandQueue(
ctx->ocf.hwctx->context,
106 ctx->ocf.hwctx->device_id,
107 0, &cle);
109 "command queue %d.\n", cle);
110
111 ctx->vert_kernel = clCreateKernel(
ctx->ocf.program,
112 "vert_sum", &cle);
114 "vert_sum kernel %d.\n", cle);
115
116 ctx->horiz_kernel = clCreateKernel(
ctx->ocf.program,
117 "horiz_sum", &cle);
119 "horiz_sum kernel %d.\n", cle);
120
121 ctx->accum_kernel = clCreateKernel(
ctx->ocf.program,
122 "weight_accum", &cle);
124 "accum kernel %d.\n", cle);
125
126 ctx->average_kernel = clCreateKernel(
ctx->ocf.program,
127 "average", &cle);
129 "average kernel %d.\n", cle);
130
131 ctx->integral_img = clCreateBuffer(
ctx->ocf.hwctx->context, 0,
135 "integral image %d.\n", cle);
136
137 ctx->weight = clCreateBuffer(
ctx->ocf.hwctx->context, 0,
138 weight_buf_size,
NULL, &cle);
140 "weight buffer %d.\n", cle);
141
142 ctx->sum = clCreateBuffer(
ctx->ocf.hwctx->context, 0,
143 weight_buf_size,
NULL, &cle);
145 "sum buffer %d.\n", cle);
146
147 ctx->overflow = clCreateBuffer(
ctx->ocf.hwctx->context, 0,
148 sizeof(cl_int),
NULL, &cle);
150 "overflow buffer %d.\n", cle);
151
152 ctx->initialised = 1;
153 return 0;
154
160
165
167 return err;
168 }
169
172 {
174 const float zero = 0.0f;
175 const size_t worksize1[] = {
height};
176 const size_t worksize2[] = {
width};
178 int i, dx, dy, err = 0, weight_buf_size;
179 cl_int cle;
180 int nb_pixel, *
tmp =
NULL, idx = 0;
182
184 cle = clEnqueueFillBuffer(
ctx->command_queue,
ctx->weight,
185 &
zero,
sizeof(
float), 0, weight_buf_size,
188 cle);
189 cle = clEnqueueFillBuffer(
ctx->command_queue,
ctx->sum,
190 &
zero,
sizeof(
float), 0, weight_buf_size,
193 cle);
194
195 nb_pixel = (2 *
r + 1) * (2 *
r + 1) - 1;
196 dxdy =
av_malloc(nb_pixel * 2 *
sizeof(cl_int));
198
201
202 for (dx = -
r; dx <=
r; dx++) {
203 for (dy = -
r; dy <=
r; dy++) {
204 if (dx || dy) {
207 }
208 }
209 }
210 // repack dx/dy seperately, as we want to do four pairs of dx/dy in a batch
211 for (
i = 0;
i < nb_pixel / 4;
i++) {
212 dxdy[
i * 8] =
tmp[
i * 8];
// dx0
213 dxdy[
i * 8 + 1] =
tmp[
i * 8 + 2];
// dx1
214 dxdy[
i * 8 + 2] =
tmp[
i * 8 + 4];
// dx2
215 dxdy[
i * 8 + 3] =
tmp[
i * 8 + 6];
// dx3
216 dxdy[
i * 8 + 4] =
tmp[
i * 8 + 1];
// dy0
217 dxdy[
i * 8 + 5] =
tmp[
i * 8 + 3];
// dy1
218 dxdy[
i * 8 + 6] =
tmp[
i * 8 + 5];
// dy2
219 dxdy[
i * 8 + 7] =
tmp[
i * 8 + 7];
// dy3
220 }
222
223 for (
i = 0;
i < nb_pixel / 4;
i++) {
224 cl_int *dx_cur = dxdy + 8 *
i;
225 cl_int *dy_cur = dxdy + 8 *
i + 4;
226
227 // horizontal pass
228 // integral(x,y) = sum([u(v,y) - u(v+dx,y+dy)]^2) for v in [0, x]
235 cle = clEnqueueNDRangeKernel(
ctx->command_queue,
ctx->horiz_kernel, 1,
238 cle);
239 // vertical pass
240 // integral(x, y) = sum(integral(x, v)) for v in [0, y]
245 cle = clEnqueueNDRangeKernel(
ctx->command_queue,
ctx->vert_kernel,
248 cle);
249
250 // accumulate weights
261 cle = clEnqueueNDRangeKernel(
ctx->command_queue,
ctx->accum_kernel,
264 }
266
267 // average
272 cle = clEnqueueNDRangeKernel(
ctx->command_queue,
ctx->average_kernel, 2,
275 cle);
276 cle = clFlush(
ctx->command_queue);
281 if (dxdy)
283 return err;
284 }
285
287 {
296 const cl_int
zero = 0;
297 int w,
h, err, cle,
overflow, p, patch, research;
298
302
303 if (!
input->hw_frames_ctx)
307
312 }
313
315 if (err < 0)
317
318 if (!
ctx->initialised) {
325 }
328
330 if (err < 0)
332 }
333
334 cle = clEnqueueWriteBuffer(
ctx->command_queue,
ctx->overflow, CL_FALSE,
337 "detection buffer %d.\n", cle);
338
341 dst = (cl_mem)
output->data[p];
342
343 if (!dst)
344 break;
348 patch = (p ?
ctx->patch_size_uv :
ctx->patch_size) / 2;
349 research = (p ?
ctx->research_size_uv :
ctx->research_size) / 2;
351 if (err < 0)
353 }
354 // overflow occurred?
355 cle = clEnqueueReadBuffer(
ctx->command_queue,
ctx->overflow, CL_FALSE,
358
359 cle = clFinish(
ctx->command_queue);
361
364
366
370
372
374 clFinish(
ctx->command_queue);
377 return err;
378 }
379
381 {
383 cl_int cle;
384
389
394
396
398 }
399
400 #define OFFSET(x) offsetof(NLMeansOpenCLContext, x)
401 #define FLAGS (AV_OPT_FLAG_FILTERING_PARAM | AV_OPT_FLAG_VIDEO_PARAM)
409 };
410
412
414 {
419 },
420 };
421
423 {
427 },
428 };
429
431 .
name =
"nlmeans_opencl",
434 .priv_class = &nlmeans_opencl_class,
441 };